*! 1.2 Molly M. King 30 March 2022
 
 *1.1 initial release
 *1.2 update: Added functionality to deal with missing values

***-----------------------------***

capture program drop redi
program define redi

version 16

syntax varlist(min=2 max=2), ///
	Generate(name) [CPStype(string)] ///
	[INFlationyear(numlist integer >=1978 max=1)]
/*  
redi varlist(min=2 max=2), /// incvar year
	Generate(newvarname) [INFlation(integer 0)] ///
	[CPStype(string)] /// (household|family|respondent)
	[refvars(namelist min=0 max=2)] 	
*/

tempfile research_data
save `research_data'
	
***-----------------------------***
// # DEFINE ALL LOCALS

// Variable list
// categorical income variable name from research dataset
	local inc_cat_var : word 1 of `varlist'
// year variable name from research dataset
	local research_year : word 2 of `varlist'
	
	
// New variable	created for continuous income for research dataset
local new_inc_var "`generate'"


// Use CPS-ASEC as reference dataset
local reference_dataset "cps_reference.dta"
local ref_year "year" // Name of variable in CPS-ASEC, default reference  

// income_type specified by user for CPS-ASEC reference
if "`cpstype'" == "family" & `"`using'"' == "" {
	use `reference_dataset', clear  // Dataset placed in cwd by user			
	keep if pernum == 1 // Keeps only one person per household
	compress
	tempfile ref_data
	save `ref_data'
	local ref_income_var "ftotval" // Name in reference (CPS-ASEC) dataset
	di "Income type set as `cpstype' income; " ///
	   "Calculating family income using ftotval CPS-ASEC variable."
}
else if "`cpstype'" == "household" & `"`using'"' == ""  {
	use `reference_dataset', clear  // Dataset placed in cwd by user			
	keep if pernum == 1 // Keeps only one person per household
	compress
	tempfile ref_data
	save `ref_data'
	local ref_income_var "hhincome" // Name in reference (CPS-ASEC) dataset
	di "Income type set as `cpstype' income; " ///
	   "Calculating household income using hhincome CPS-ASEC variable."
}
else if "`cpstype'" == "respondent" & `"`using'"' == "" {
	use `reference_dataset', clear  // Dataset placed in cwd by user			
	keep if pernum == 1 // Keeps only one person per household
	compress
	tempfile ref_data
	save `ref_data'
	local ref_income_var "inctot" // Name in reference (ASEC) dataset
	di "Income type set as `cpstype' income; " ///
	   "Calculating respondent income using inctot ASEC variable."
}	
else if `"`cpstype'"' != "household" & `"`cpstype'"' != "family" & `"`csstype'"' != "respondent" {
	display as err "CPS income type must be either household, family, or respondent."
	error 999
}


// Inflation

local inflation_year `inflationyear'
di "Inflation year is `inflation_year'"

// Check if inflation year provided by user
capture assert `inflation_year' !=. // verifies expression is true
if _rc { // if return code, expression false (no inflation) 
	local inflate = "no"
} 
else if _rc == 0 { // If rc, expression true --> inflationyear empty/missing
	local inflate = "yes"	
	di "Get data: Inflation year is " `inflation_year'
	import excel "https://www.bls.gov/cpi/research-series/r-cpi-u-rs-allitems.xlsx", ///
		clear  cellrange(A6) firstrow
		
	// Rename columns
	keep YEAR AVG
	rename YEAR year
	rename AVG cpi_avg  // "CPI-R-US all items average cost price inflator"
	tempfile temp_cpi
	save `temp_cpi', replace

	keep if year == `inflation_year' // Keep only CPI data from year inflating to
	local avg_`inflation_year' = cpi_avg
	di "CPI average for inflation year is " `avg_`inflation_year''	

	use `temp_cpi', clear
	gen inflation_factor = .
	// Divide year inflating from / year inflating to
	replace inflation_factor = cpi_avg / `avg_`inflation_year''
	label var inflation_factor "Conversion factor - to convert to $`inf_year, divide by inflation_factor"
	save `temp_cpi', replace
}


***-----------------------------***
// # CONVERT CATEGORICAL TO CONTINUOUS INCOME VALUES

// Create local for levels of income = e.g., levelsof year, local(years)
use `research_data', clear

mvencode `inc_cat_var', mv(98)

levelsof `inc_cat_var', local("`inc_cat_var'_levels") 

// Create numeric variables indicating edges of income categories //
// Note: for more detail on if-command and Stata regular expressions used to create this, see:
	*http://www.stata.com/statalist/archive/2013-03/msg00654.html
	*http://www.stata.com/support/faqs/programming/if-command-versus-if-qualifier/
	*http://stats.idre.ucla.edu/stata/faq/how-can-i-extract-a-portion-of-a-string-variable-using-regular-expressions/

// List income levels
di "The income levels are: " "``inc_cat_var'_levels'"
		
// If variables are present either as strings OR as labels of categorical variables:
capture confirm string variable `inc_cat_var'
if !_rc {
	gen inc_decoded = `inc_cat_var' // action for string variable
}
else {
	// Decode converts labels to string variables for research dataset
	decode `inc_cat_var', gen(inc_decoded) // action for numeric variable
}

tempfile working_regex
save `working_regex', replace

// Levels of research year variable to loop through years in dataset
levelsof `research_year', local(years)
di  "Create local variable years to loop within that income level - values:" `years'

foreach y of local years { // loop through all years

	// Loop through all values of income categories (`inc_level')
	foreach inc_level of local `inc_cat_var'_levels {

		use `working_regex', clear
	
		// Keep data if income variable is equal to current income level (of loop)
		keep if `inc_cat_var' == `inc_level'  
		di "The current inc_level is: " `inc_level'

		// if `inc_cat_var' is missing, 
		if `inc_cat_var' == 98 | `inc_cat_var' == . {
			di "The inc_level " `inc_level' " is missing (recoded as 98)"
			// keep it missing for lower_bound and upper_bound
			gen `inc_cat_var'_lb = .
			gen `inc_cat_var'_ub = .
			di "Lower and upper bound of . created for inc_level " `inc_level'
			
			// complete the creation of the dataset will need later
			gen obs_no = .
			gen category_n = .
			tempfile temp_`inc_level'_`y'
			save `temp_`inc_level'_`y'', replace
			di "Saved REDI data (with continuous data) for the missing inc_level " ///
			   "for year `y' in file"
		}
		
		// For text at beginning of the string:
		else if regexm(inc_decoded, "^[a-z/A-Z]+") == 1 {
			di "The inc_level " `inc_level' " is at the lowest end of the " ///
			   "original Research dataset income range"
			destring inc_decoded, ///
				ignore("Less than LESS THAN Under,$ Lt") generate(`inc_cat_var'_ub) 
			gen `inc_cat_var'_lb = -100000
			di "Lower bound of -100,000 created for inc_level " `inc_level'
		}

		// For text at end of the string:
		else if regexm(inc_decoded, "[a-z/A-Z]+$") == 1 {
			di "The inc_level " `inc_level' " is at the highest end of the " ///
			   "original Research dataset income range"
			destring inc_decoded, ///
				ignore("and over or over,$ or more OR MORE") generate(`inc_cat_var'_lb) 
			gen `inc_cat_var'_ub = 999999  // my topcode for asec purposes
			di "Upper bound of 999999 created for inc_level " `inc_level'
		}

		// For labels with 2 numbers in them
		else if regexm(inc_decoded, "[0-9]+$") == 1 {
		// Since those at lowest and highest ranges have already been matched 
		// (using text), this leaves those with ranges
			di "The inc_level " `inc_level' " has a lower and an upper level"
			split inc_decoded, ///
				parse("-" "to" "-" "to under" "to less than" "UP TO" "TO" "but less than") ///
				ignore(" ,$") destring
			gen `inc_cat_var'_lb = inc_decoded1
			gen `inc_cat_var'_ub = inc_decoded2
			di "Lower bound of `inc_cat_var'_lb and upper bound of `inc_cat_var'_ub " ///
			   "created for inc_level " `inc_level' " of the original Research " ///
			   "dataset income range"
		}
		
		else { // Error - in case doesn't fit any existing category
			di as error "Error: The inc_level " `inc_level' ///
						" does not fit any of the existing regular expressions " ///
						"designs. Please email the REDI program creator at " ///
						"mollymkingphd@gmail.com with information about this " ///
						"error and a list of all of your income levels so she " ///
						"can update the program. Thank you for your help!"
			error 999
		}

		// MERGE RESEARCH AND REFERENCE DATA
		
		// Create locals to use later for selecting appropriate reference data bounds
		local upper_bound = `inc_cat_var'_ub
		local lower_bound = `inc_cat_var'_lb

		// Drop variables used in creating lower and upper bounds
		capture drop inc_decoded1 inc_decoded2

		// For nonmissing categories:
		if `inc_cat_var' != 98 & `inc_cat_var' != . {
			
			// Now, still within the single-income-level loop in research dataset:
			// summarize: count of how many individuals in that income level during year
			quietly summarize if `research_year' == `y' & `inc_cat_var' == `inc_level'
			local sample_size = r(N) // count how many rows there are in survey dataset
			 // count N_B for each bin 
			 // (# of observations in Research dataset income category between L_bn and U_bn)
					
			// Create count within each Research dataset income level
			gen `inc_cat_var'_n = `sample_size' // size of bin in Research dataset
			label variable `inc_cat_var'_n "N in income level in research dataset"

			// Create temporary file of just this Research dataset income level and year 
			// can merge reference/CPS-ASEC incomes back into later
			keep if `research_year' == `y' & `inc_cat_var' == `inc_level'
			gen id = _n
			tempfile premerge_`inc_level'_`y'
			save `premerge_`inc_level'_`y'', replace
			di "Saved tempfile with upper and lower bounds of research data income levels"
			
			// Keep reference/CPS-ASEC data if within income bounds and for given year
			use `ref_data', clear
			di "Reference data open - Income type set as `cpstype' income" 
			
			keep if `ref_year' == `y' & ///
				`ref_income_var' >= `lower_bound' & ///
				`ref_income_var' <= `upper_bound'
			gen obs_no = _n
			di "Observation number generated"
		
			tempfile ref_`y'_`inc_level'
			save `ref_`y'_`inc_level'', replace
		
			di "Keep income between $`lower_bound' and $`upper_bound' for `y' year for ASEC."
			
			// Take random draw of number of incomes w/in income boundary, year
			// Sample, with replacement, such that ASEC income has an equal probability 
			// of being assigned to each observation, creting sample of size equal
			// to research dataset for income level, year
			// Thanks to Clyde Schechter:
				// https://www.statalist.org/forums/forum/general-stata-discussion/general/ 	
				// 1475890-is-there-a-command-that-is-equivalent-to-bsample-more-than-_n
			quietly des 
			local N = r(N)
			clear
			set obs `sample_size'
			gen obs_no = runiformint(1, `N')
			merge m:1 obs_no using `ref_`y'_`inc_level'', ///
				keep(match) nogenerate 
			save `ref_`y'_`inc_level'', replace
			di "Completed random draw of `N' incomes from ASEC between $" ///
				" `lower_bound' and $`upper_bound' (inc level `inc_level' )"
				
			// Merge Research and random sample of reference (ASEC) data within incomebin, year
			gen id = _n
			merge 1:1 id using `premerge_`inc_level'_`y'', ///
				keep(match)
			di "Merged ASEC values with original Research dataset for inc_level " ///
			   `inc_level' " ($`lower_bound'-`upper_bound') and year `y'."
					
			// Create count within each REDI income level
			generate 	category_n = .
			replace 	category_n = `N' // r(N) from above
			label var 	category_n "N respondents in income level in post-merge dataset'"

			// Save these new REDI data (with yearly ASEC continuous data) 
			// in a tempfile we can use later for merging, etc.
			tempfile temp_`inc_level'_`y'
			save `temp_`inc_level'_`y'', replace
			di "Saved REDI data (with continuous data) for inc_level " ///
			   `inc_level' " ($`lower_bound'-`upper_bound') and year `y' in file"
		
		} 	// close conditional for nonmissing categories
			
	}	/// Close loop through all values of income categories (`inc_level')
	
	// Append all income levels across all years
	tokenize ``inc_cat_var'_levels'
	local first `1'
	use `temp_`1'_`y'', clear
	macro shift
	local rest `*'

	// Loop through and append each temp file created within income level loop
	foreach bracket in `*' {
		append using `temp_`bracket'_`y''
	}

	// Create new tempfile to save each year's data for all income levels
	tempfile temp_`y'
	save `temp_`y'', replace
	di "Saved new tempfile for year `y' for all REDI  income levels."
						
} /// End of loop through all years
				
// Append all years 
tokenize `years'
local first `1'
use `temp_`1'', clear
macro shift
local rest `*'

// Loop through and append each yearly temp file created earlier
foreach ytemp in `*' {
	append using `temp_`ytemp''
}

di "Appended all years"

***-----------------------------***
// # RENAME FINAL VARIABLES
gen `new_inc_var' = `ref_income_var'
format `new_inc_var' %6.0fc
label var `new_inc_var' "REDI continuous `inc_cat_var' income"

// # CLEAN UP
drop obs_no id inc_decoded category_n `inc_cat_var'_n
drop _merge
capture drop hhincome ftotval inctot
mvdecode `inc_cat_var', mv(98) /// change 98 values back to missing values


***-----------------------------***

// # INFLATE based on specified inflation_year
if "`inflate'" == "yes" { // if inflation year is not empty	
di "INFLATE: Inflation year is " `inflation_year'
	merge m:1 year using `temp_cpi'
	
	// INFLATION-ADJUSTED INCOME = divide continuous income variable by conversion factor
	// original income variable, adjusted for inflation
	gen `new_inc_var'_inf`inflation_year' = `new_inc_var' / inflation_factor
	format `new_inc_var'_inf`inflation_year' %6.0fc
	label var `new_inc_var'_inf`inflation_year' ///
		"REDI continuous inflation-adjusted `inc_var' income, `inflation_year' dollars"

	// Drop inflation data to clean up
	drop if _merge == 2
	drop cpi_avg  inflation_factor _merge

}

***-----------------------------***

end