*! bfmsvycorr v0.0.0.9000 Thomas Blanchet, Ignacio Flores, Marc Morgan program bfmcorr, eclass version 11 syntax using/, Weight(varname) INCome(varname) HOUseholds(varname) TAXUnit(string) /// [TAXINCome(varname) /// HOLDMARgins(varlist) VARMARgins(varlist) FREQMARgins(numlist >0 <1) /// INCOMECOMPosition(string) INCOMEPOPulation(string) /// TRUSTstart(string) MERGingpoint(string) /// TAXPerc(name) TAXThr(name) TAXAvg(name) THETALIMit(real 5) MINBracket(real 10) /// NOREPlace Knn(real 10) SAMPletop(real 0.1) /// SLope(real -1) PEnalization(real 20)] // ---------------------------------------------------------------------- // // Check validity of the input data and arguments // ---------------------------------------------------------------------- // // Tax data file exists confirm file "`using'" if ("`incomecomposition'" != "") { confirm file "`incomecomposition'" } if ("`incomepopulation'" != "") { confirm file "`incomepopulation'" } // No weights missing capture assert !missing(`weight') if (_rc) { display as error "weights must be nonmissing" exit 416 } // No weights below one capture assert (`weight' >= 1) if (_rc) { display as error "weights must be greater or equal to one", _continue display as error "(they should sum to the population size)" exit 498 } // The tax unit is either individuals or households if !inlist(substr("`taxunit'", 1, 1), "i", "h") { display as error "option 'taxunit' incorrectly specified:", _continue display as error "must start with 'i' (for individuals) or 'h' (for households)" exit 198 } // We have a value for all the new margins if ("`varmargins'" != "") { foreach v of varlist `varmargins' { quietly levelsof `v', local(values) clean if ("`values'" != "0 1") { display as error "the option varmargins(...) only supports", _continue display as error "dummy variables (equal to 0 or 1) and must", _continue display as error "take both values at least once" exit 450 } } local nb_varmargins: word count `varmargins' local nb_freqmargins: word count `freqmargins' if (`nb_varmargins' != `nb_freqmargins') { display as error "the options varmargins(...) and", _continue display as error "freqmargins(...) must have the same size" exit 198 } } // Tax unit is valid if (substr("`taxunit'", 1, 1) == "i") { local taxunit "i" } else if (substr("`taxunit'", 1, 1) == "h") { local taxunit "h" } else { display as error "option taxunit(...) incorrectly specified:", _continue display as error "must start with 'i' (for individals) or 'h' (for households)" exit 198 } // Weights and income variables are consistent with the statistical unit sort `households' `weight' tempvar nvals_weight quietly by `households' `weight': generate `nvals_weight' = (_n == 1) quietly by `households': replace `nvals_weight' = sum(`nvals_weight') quietly by `households': replace `nvals_weight' = `nvals_weight'[_N] capture assert `nvals_weight' == 1 if (_rc != 0) { display as error "weights not constant within households" exit 407 } if ("`taxunit'" == "h") { sort `households' `income' tempvar nvals_income quietly by `households' `income': generate `nvals_income' = (_n == 1) quietly by `households': replace `nvals_income' = sum(`nvals_income') quietly by `households': replace `nvals_income' = `nvals_income'[_N] capture assert `nvals_income' == 1 if (_rc != 0) { display as error "you specified the option taxunit(h), yet `income'", _continue display as error "is not constant within households" exit 459 } drop `nvals_income' if ("`taxincome'" != "") { sort `households' `taxincome' tempvar nvals_income quietly by `households' `taxincome': generate `nvals_income' = (_n == 1) quietly by `households': replace `nvals_income' = sum(`nvals_income') quietly by `households': replace `nvals_income' = `nvals_income'[_N] capture assert `nvals_income' == 1 if (_rc != 0) { display as error "you specified the option taxunit(h), yet `taxincome'", _continue display as error "is not constant within households" exit 459 } drop `nvals_income' } } // Use income as tax income if taxincome() not specified if ("`taxincome'" == "") { local taxincome `income' } // Trust region and merging point if ("`mergingpoint'" == "" | "`mergingpoint'" == "_auto") { if ("`truststart'" == "") { display as error "you must either specify the trust region of your tax data", _continue display as error "or specify the merging point directly" exit 198 } confirm number `truststart' if (_rc != 0) { display as error "option truststart(...) must be a number" exit 198 } if (`truststart' <= 0 | `truststart' >= 1) { display as error "option trustregion(...) incorrectly specified:", _continue display as error "must be between zero and one (excluded)" exit 198 } } else { capture confirm number `mergingpoint' if (_rc != 0) { display as error "option mergingpoint(...) must be a number" exit 198 } if (`mergingpoint' <= 0 | `mergingpoint' >= 1) { display as error "option mergingpoint(...) incorrectly specified:", _continue display as error "must be between zero and one (excluded)" exit 198 } local truststart `mergingpoint' } // ---------------------------------------------------------------------- // // Check that variables to be added do not already exist // ---------------------------------------------------------------------- // foreach v in _correction _weight _pid _hid _factor { capture confirm `v' if (_rc == 0) { display as error "variable {bf:`v'} already defined" exit 110 } } // ---------------------------------------------------------------------- // // Store data for initial Lorenz (cf. postestimation commands) // ---------------------------------------------------------------------- // preserve tempvar freq_old F_old fy_old cumfy_old L_old poptot ftile_old sort `income' quietly sum `weight', meanonly quietly gen `freq_old' = `weight'/r(sum) quietly gen `F_old' = sum(`freq_old') quietly gen `fy_old' = `freq_old'*`income' quietly gen `cumfy_old' = sum(`fy_old') quietly sum `cumfy_old' local cumfy_max = r(max) quietly gen `L_old' = `cumfy_old'/`cumfy_max' quietly egen `ftile_old' = cut(`F_old'), at(0.00(0.01)0.99 0.991(0.001)0.999 0.9991(0.0001)1) quietly collapse (min) `L_old', by(`ftile_old') tempname mat_lorenz_old mkmat `ftile_old' `L_old', matrix(`mat_lorenz_old') mat colnames `mat_lorenz_old' = "ftile_old" "L_old" restore // ---------------------------------------------------------------------- // // Store data for summarizing initial data (cf. postestimation commands) // ---------------------------------------------------------------------- // preserve tempvar freq F fy cumfy L d_eq bckt_size trunc_avg ftile wy // Total average sort `income' quietly sum `income' [w=`weight'], meanonly local old_avg = r(mean) local y_max_old=r(max) //Gini quietly sum `weight', meanonly local poptot = r(sum) quietly gen `freq' = `weight'/`poptot' quietly gen `F' = sum(`freq') quietly gen `fy' = `freq'*`income' quietly gen `cumfy' = sum(`fy') quietly sum `cumfy', meanonly local cumfy_max = r(max) quietly gen `L' = `cumfy'/`cumfy_max' quietly gen `d_eq' = (`F'-`L')*`weight'/`poptot' quietly sum `d_eq', meanonly local d_eq_tot = r(sum) local gini = `d_eq_tot'*2 // Classify obs in 127 g-percentiles quietly egen `ftile' = cut(`F'), at(0(0.01)0.99 0.991(0.001)0.999 0.9991(0.0001)0.9999 0.99991(0.00001)0.99999 1) // Top average gsort -`F' quietly gen `wy' = `income'*`weight' quietly gen topavg_old = sum(`wy')/sum(`weight') sort `F' // Interval thresholds quietly collapse (min) thr_old = `income' (mean) bckt_avg_old = `income' (min) topavg_old [w=`weight'], by (`ftile') sort `ftile' quietly gen ftile = `ftile' // Generate 127 percentiles from scratch tempfile collapsed_sum quietly save "`collapsed_sum'" clear quietly set obs 127 quietly gen ftile = (_n - 1)/100 in 1/100 quietly replace ftile = (99 + (_n - 100)/10)/100 in 101/109 quietly replace ftile = (99.9 + (_n - 109)/100)/100 in 110/118 quietly replace ftile = (99.99 + (_n - 118)/1000)/100 in 119/127 quietly merge n:1 ftile using "`collapsed_sum'" // Interpolate missing info quietly ipolate bckt_avg_old ftile, gen(bckt_avg2_old) quietly ipolate thr_old ftile, gen(thr2_old) quietly ipolate topavg_old ftile, gen(topavg2_old) sort ftile drop bckt_avg_old thr_old topavg_old quietly rename bckt_avg2_old bckt_avg_old quietly rename thr2_old thr_old quietly rename topavg2_old topavg_old quietly sum bckt_avg_old, meanonly quietly replace bckt_avg_old = r(max) if missing(bckt_avg_old) quietly sum thr_old, meanonly quietly replace thr_old = r(max) if missing(thr_old) quietly sum topavg_old, meanonly quietly replace topavg_old = r(max) if missing(topavg_old) // Top shares quietly replace ftile = round(ftile, 0.00001) quietly gen topshare_old = (topavg_old/`old_avg')*(1 - ftile) // Total average quietly gen average_old = . quietly replace average_old = `old_avg' in 1 // Inverted beta coefficient quietly gen b_old = topavg_old/thr_old // Fractile quietly gen p_old = round(ftile, 0.00001) // Gini quietly gen gini_old = `gini' //Order and save order gini_old p_old thr_old average_old bckt_avg_old topavg_old topshare_old b_old keep gini_old p_old thr_old average_old bckt_avg_old topavg_old topshare_old b_old tempname mat_sum_old mkmat gini_old average_old p_old thr_old bckt_avg_old topavg_old topshare_old b_old, matrix(`mat_sum_old') restore // ---------------------------------------------------------------------- // // Import and store the tax data // ---------------------------------------------------------------------- // // Get population size as the sum of weights quietly summarize `weight', meanonly local ind_popsize = r(sum) sort `households' tempvar hhmemno quietly by `households': generate `hhmemno' = _n quietly summarize `weight' if (`hhmemno' == 1), meanonly local hh_popsize = r(sum) drop `hhmemno' if ("`taxunit'" == "i") { local popsize = `ind_popsize' } else { local popsize = `hh_popsize' } // Get the name of the variables in the tax data if ("`taxperc'" == "") { local taxperc "p" } if ("`taxthr'" == "") { local taxthr "thr" } if ("`taxavg'" == "") { local taxavg "bracketavg" } tempfile rawdata rawtaxdata taxdata quietly save "`rawdata'" clear // Identify the format of the tax data if (substr("`using'", -4, .) == ".xls" | substr("`using'", -5, .) == ".xlsx") { // Excel format quietly import excel using "`using'", firstrow keep `taxperc' `taxthr' `taxavg' } else if (substr("`using'", -4, .) == ".csv") { // CSV format quietly import delimited using "`using'", varnames(1) keep `taxperc' `taxthr' `taxavg' } else { // Stata format quietly use `taxperc' `taxthr' `taxavg' using "`using'" } // Rename tax data variables to avoid conflicts tempvar new_taxperc new_taxthr new_taxavg rename `taxperc' `new_taxperc' rename `taxthr' `new_taxthr' rename `taxavg' `new_taxavg' local taxperc `new_taxperc' local taxthr `new_taxthr' local taxavg `new_taxavg' // Drop values missing or below trust region quietly drop if missing(`taxperc') | missing(`taxthr') quietly drop if (`taxperc' < `truststart') quietly save "`rawtaxdata'", replace keep `taxperc' `taxthr' // Check that there are some tax data left quietly count local nbrackets = r(N) if (`nbrackets' == 0) { display as error "no tax data within trust region" quietly use "`rawdata'", clear exit 2000 } // Check that the tax data is valid sort `taxperc' capture assert (`taxthr'[_n + 1] > `taxthr') if (_n < _N) if (_rc) { display as error "tax thresholds must be increasing" use "`rawdata'", clear exit 498 } // Store the list of fractiles and thresholds for later quietly levelsof `taxperc', local(list_taxperc) quietly levelsof `taxthr', local(list_taxthr) // Calculate population in each bracket tempvar bracketsize generate `bracketsize' = `popsize'*(cond(missing(`taxperc'[_n + 1]), 1, `taxperc'[_n + 1]) - `taxperc') // Create index for brackets tempvar bracket_index generate `bracket_index' = _n - 1 // Keep highest bracket in memory quietly summarize `taxthr', meanonly local max_taxthr = r(max) quietly summarize `bracket_index', meanonly local max_taxindex = r(max) // Save the tax data quietly save "`taxdata'" // ---------------------------------------------------------------------- // // Match survey data with tax brackets // ---------------------------------------------------------------------- // display as text "Matching survey observations with tax brackets..." // Import the survey data back use "`rawdata'", clear // If households are the tax unit, collapse the data to get that proper unit if ("`taxunit'" == "h") { if ("`income'" != "`taxincome'") { collapse (firstnm) `weight' (firstnm) `taxincome' (firstnm) `income', by(`households') } else { collapse (firstnm) `weight' (firstnm) `income', by(`households') } } tempfile data_collapsed quietly save "`data_collapsed'" // Calculate the survey CDF gsort -`taxincome' tempvar surveycdf quietly generate `surveycdf' = 1 - sum(`weight')/`popsize' sort `taxincome' quietly egen `bracket_index' = cut(`taxincome'), at(`list_taxthr') icodes // We need to deal separately with survey observations in the last bracket quietly replace `bracket_index' = `max_taxindex' if (`taxincome' >= `max_taxthr') tempfile rawdata_collapsed quietly save "`rawdata_collapsed'" quietly drop if missing(`bracket_index') quietly count if (r(N) <= 5) { display as error "not enough survey observations matching tax data trust region" exit 459 } // Calculate population in each bracket tempvar svybsize svynobs taxnobs collapse (sum) `svybsize' = `weight' (min) `surveycdf' = `surveycdf' (count) `taxnobs' = `taxincome' (count) `svynobs' = `income', by(`bracket_index') // Merge with the tax data quietly merge n:1 `bracket_index' using "`taxdata'", nogenerate keep(match using) keepusing(`bracketsize' `taxperc' `taxthr') quietly replace `taxnobs' = 0 if missing(`taxnobs') quietly replace `svynobs' = 0 if missing(`svynobs') quietly replace `svybsize' = 0 if missing(`svybsize') quietly replace `surveycdf' = 1 if missing(`surveycdf') quietly save "`taxdata'", replace // ---------------------------------------------------------------------- // // Determine merging point // ---------------------------------------------------------------------- // if ("`mergingpoint'" == ""| "`mergingpoint'" == "auto") { di as txt "Searching merging point..." quietly use "`taxdata'" // Calculate (small) theta coefficient tempvar small_theta quietly generate `small_theta' = `svybsize'/`bracketsize' // Calculate (big) theta coefficient tempvar big_theta quietly generate `big_theta' = `surveycdf'/`taxperc' tempvar small_theta_smooth quietly generate `small_theta_smooth' = . sort `taxperc' tempvar iso_wgt quietly generate `iso_wgt' = cond(missing(`taxperc'[_n + 1]), 1, `taxperc'[_n + 1]) - `taxperc' mata: a = . mata: w = . mata: st_view(a, ., st_local("small_theta")) mata: st_view(w, ., st_local("iso_wgt")) mata: y = isotonic_pava(w, a, 1) mata: st_store(., st_local("small_theta_smooth"), y) // Find merging point gsort -`taxperc' tempvar region quietly generate `region' = (sum(`small_theta_smooth' > `big_theta') == 0) sort `taxperc' quietly count if `region' if (r(N) == 0) { display as error "relative bias is greater than one at the top: nothing to correct" exit 459 } quietly count if !`region' if (r(N) > 0) { // If `small_theta_smooth' is constant and >= 1 over an interval, use the value corresponding // to the highest percentile summarize `small_theta_smooth' if `region', meanonly summarize `taxperc' if (`small_theta_smooth' == r(max)), meanonly quietly replace `region' = 0 if (`taxperc' < r(max)) & (`small_theta_smooth' >= 1) quietly summarize `taxperc' if `region' local mergingpoint = r(min) quietly summarize `bracket_index' if `region' local mergingindex = r(min) quietly summarize `taxthr' if `region' local mergingthr = r(min) // Store for potential postestimation command tempvar extrapol quietly generate `extrapol' = 0 tempname theta mkmat `taxperc' `big_theta' `small_theta' `small_theta_smooth' `extrapol' `taxthr', matrix(`theta') matrix colnames `theta' = "p" "big_theta" "small_theta" "small_theta_iso" "extrapol" "thr" display as text "Merging point found within trust region: p =", _continue display as text round(`mergingpoint', 1e-5) ",", _continue display as text "`taxincome' = " round(`mergingthr', 1) } else { display as text "Merging point not found within trust region. Extrapolating..." // Store the points at which the tabulation starts quietly summarize `taxperc', meanonly local min_taxperc = r(min) quietly summarize `taxthr', meanonly local min_taxthr = r(min) quietly save "`taxdata'", replace // Regroup brackets to make sure there is at least one observation in each of them quietly count if (`taxnobs' == 0) while (r(N) > 0) { gsort -`taxperc' tempvar newbracket queue quietly generate `queue' = sum(`taxnobs' == 0) // We choose to group the bracket with the one just below quietly generate `newbracket' = `bracket_index'[_n + 1] if (`taxnobs' == 0) // We have to make an exception for the last bracket, which we group with the one just above quietly replace `newbracket' = `bracket_index'[_n - 1] if (`taxnobs' == 0) & (_n == _N) quietly replace `bracket_index' = `newbracket' if (`queue' == 1) & (`taxnobs' == 0) collapse (sum) `svybsize' (min) `surveycdf' (sum) `taxnobs' /// (sum) `bracketsize' (min) `taxperc' (min) `taxthr', by(`bracket_index') quietly count if (`taxnobs' == 0) } sort `taxperc' tempvar log_small_theta log_thr extrapol_wgt quietly generate `log_small_theta' = log(`svybsize'/`bracketsize') quietly generate `log_thr' = log(`taxthr') quietly generate `extrapol_wgt' = cond(_n == _N, 1, `taxperc'[_n + 1]) - `taxperc' // Perform the ridge regression mata: y = . mata: st_view(y, ., st_local("log_small_theta")) mata: x = . mata: st_view(x, ., st_local("log_thr")) mata: w = . mata: st_view(w, ., st_local("extrapol_wgt")) mata: b = ridge_regression(y, x, w, strtoreal(st_local("slope")), strtoreal(st_local("penalization"))) mata: sigma = mean((y :- b[1] :- x:*b[2]):^2, w) mata: st_local("sigma", strofreal(sigma)) tempname beta_ridge mata: st_matrix(st_local("beta_ridge"), b) display as text "Elasticity of bias: " %4.0g `beta_ridge'[2, 1] if (`beta_ridge'[2, 1] >= 0) { display as error "invalid (nonnegative) elasticity estimated; check your data and try increasing penalization" exit 322 } // Extrapolate theta based on the results of the regression quietly use "`rawdata_collapsed'", clear tempvar fitted_theta log_income quietly generate `log_income' = log(`taxincome') quietly generate `fitted_theta' = . mata: new_x = . mata: st_view(new_x, ., st_local("log_income")) mata: st_store(., st_local("fitted_theta"), exp(b[1] :+ new_x:*b[2] :+ sigma/2)) // Merge with the tax data quietly merge n:1 `bracket_index' using "`taxdata'", nogenerate keepusing(`svybsize' `bracketsize' `taxperc' `taxthr') // For the observations above the trustable span, we use the observed value of theta quietly generate `small_theta' = `svybsize'/`bracketsize' if !missing(`bracket_index') quietly replace `small_theta' = `fitted_theta' if missing(`bracket_index') // Calculate corrected survey CDF tempvar fitted_weight fitted_cdf gsort -`taxincome' quietly generate `fitted_weight' = `weight'/`small_theta' quietly generate `fitted_cdf' = 1 - sum(`fitted_weight')/`popsize' // We only need the data below the trustable span for the extrapolation quietly drop if !missing(`bracket_index') // Create pseudo tax brackets based on the corrected survey tempvar pseudobracket quietly egen `pseudobracket' = cut(`fitted_cdf'), /// at(0(0.01)0.99 0.991(0.001)0.999 0.9991(0.0001)0.9999 0.99991(0.00001)0.99999) collapse (sum) `fitted_weight' (sum) `weight' (min) `surveycdf' (min) `taxincome' (count) `taxnobs' = `taxincome' (count) `svynobs' = `income', by(`pseudobracket') // Group again in case of identical thresholds collapse (sum) `fitted_weight' (sum) `weight' (min) `surveycdf' (min) `pseudobracket' (sum) `taxnobs' (sum) `svynobs', by(`taxincome') quietly drop if (`pseudobracket' >= `truststart') quietly generate `small_theta' = `weight'/`fitted_weight' quietly generate `small_theta_smooth' = `weight'/`fitted_weight' quietly generate `big_theta' = `surveycdf'/`pseudobracket' // Add the tax data for trustable span rename `pseudobracket' `taxperc' rename `taxincome' `taxthr' rename `fitted_weight' `bracketsize' rename `weight' `svybsize' append using "`taxdata'" sort `taxperc' // Store for potential postestimation command tempvar extrapol quietly generate `extrapol' = (`taxperc' < `truststart') tempname theta mkmat `taxperc' `big_theta' `small_theta' `small_theta_smooth' `extrapol' `taxthr', matrix(`theta') matrix colnames `theta' = "p" "big_theta" "small_theta" "small_theta_iso" "extrapol" "thr" // Find merging point tempvar region quietly generate `region' = (`small_theta_smooth' <= `big_theta') quietly count if `region' if (r(N) == 0) { display as error "relative bias is greater than one at the top: nothing to correct" exit 459 } quietly count if `region' if (r(N) > 0) { quietly summarize `taxperc' if `region' local mergingpoint = r(min) quietly summarize `bracket_index' if `region' local mergingindex = r(min) quietly summarize `taxthr' if `region' local mergingthr = r(min) display as text "Merging point found by extrapolation: p =", _continue display as text round(`mergingpoint', 1e-5) ",", _continue display as text "`taxincome' = " `mergingthr' quietly keep if `region' sort `taxperc' quietly replace `bracket_index' = _n - 1 quietly save "`taxdata'", replace } } } else { quietly use "`taxdata'" // Calculate (small) theta coefficient tempvar small_theta quietly generate `small_theta' = `svybsize'/`bracketsize' // Calculate (big) theta coefficient tempvar big_theta quietly generate `big_theta' = `surveycdf'/`taxperc' tempvar small_theta_smooth quietly generate `small_theta_smooth' = . sort `taxperc' mata: a = . mata: w = . mata: st_view(a, ., st_local("small_theta")) mata: st_view(w, ., st_local("svynobs")) mata: y = isotonic_pava(w, a, 1) mata: st_store(., st_local("small_theta_smooth"), y) // Store for potential postestimation command tempvar extrapol quietly generate `extrapol' = 0 tempname theta mkmat `taxperc' `big_theta' `small_theta' `small_theta_smooth' `extrapol' `taxthr', matrix(`theta') matrix colnames `theta' = "p" "big_theta" "small_theta" "small_theta_iso" "extrapol" "thr" tempvar region quietly generate `region' = (`taxperc' >= `mergingpoint') quietly summarize `taxperc' if `region' local mergingpoint = r(min) quietly summarize `bracket_index' if `region' local mergingindex = r(min) quietly summarize `taxthr' if `region' local mergingthr = r(min) display as text "Using user-specified merging point: p =", _continue display as text round(`mergingpoint', 1e-5) ",", _continue display as text "`taxincome' = " `mergingthr' } // ---------------------------------------------------------------------- // // Regroup tax brackets for calibration // ---------------------------------------------------------------------- // quietly use "`taxdata'", clear quietly keep if (`taxperc' >= `mergingpoint') quietly count local nbrackets = r(N) // Regroup brackets to make sure we dont multiply weights by more than `thetalimit' tempvar s_theta cns_violated quietly generate `s_theta' = `svybsize'/`bracketsize' quietly generate `cns_violated' = (1/`s_theta' > `thetalimit' | `s_theta' < 1/`thetalimit' | `svynobs' < `minbracket') quietly count if `cns_violated' while (r(N) > 0) { gsort -`taxperc' tempvar newbracket queue quietly generate `queue' = sum(`cns_violated') // We choose to group the bracket with the one just below quietly generate `newbracket' = `bracket_index'[_n + 1] if `cns_violated' // We have to make an exception for the last bracket, which we group with the one just above quietly replace `newbracket' = `bracket_index'[_n - 1] if (`cns_violated') & (_n == _N) quietly replace `bracket_index' = `newbracket' if (`queue' == 1) & (`cns_violated') collapse (sum) `svybsize' (min) `surveycdf' (sum) `taxnobs' (sum) `svynobs' /// (sum) `bracketsize' (min) `taxperc' (min) `taxthr', by(`bracket_index') quietly generate `s_theta' = `svynobs'/`taxnobs' quietly generate `cns_violated' = (1/`s_theta' > `thetalimit' | `s_theta' < 1/`thetalimit' | `svynobs' < `minbracket') quietly count if `cns_violated' } // Check if there is enough density in survey to respect `thetalimit' constraint quietly sum `svynobs' local totsvynobs = r(sum) quietly sum `taxnobs' local totataxnobs = r(sum) if (`totsvynobs'/`totataxnobs' < 1/`thetalimit') { di "The constraint for the maximum value of 1/theta is unsuited for the survey data" exit 459 } sort `taxperc' quietly replace `bracket_index' = _n - 1 quietly save "`taxdata'", replace quietly levelsof `taxthr', local(list_taxthr) quietly levelsof `bracket_index', local(list_bracketindex) quietly summarize `bracket_index', meanonly local max_taxindex = r(max) // Keep highest bracket in memory quietly summarize `taxthr', meanonly local max_taxthr = r(max) quietly summarize `bracket_index', meanonly local max_taxindex = r(max) // Get the new list of tax brackets sort `taxperc' local list_taxperc "" local list_taxthr "" forvalues j = 1/`=_N' { local list_taxperc `list_taxperc' `=`taxperc'[`j']' local list_taxthr `list_taxthr' `=`taxthr'[`j']' local list_svynobs `list_svynobs' `=`taxnobs'[`j']' } quietly count local newnbrackets = r(N) // Display a summary of the tax data display "" display as text _column(4) "Tax and survey data summary" display as text _column(4) "{hline 60}" display as text _column(15) "Fractile", _continue display as text _column(35) "Threshold", _continue display as text _column(59) "# obs" display as text _column(4) "{hline 60}" if (`newnbrackets' > 10) { forvalues i = 1/5 { local dip: word `i' of `list_taxperc' local dithr: word `i' of `list_taxthr' local diobs: word `i' of `list_svynobs' display as res _column(3) %20.5g `dip', _continue display as res %20.5gc `dithr', _continue display as res %19.5gc `diobs' } display as text _column(20) "..." _column(41) "..." _column(61) "..." forvalues i = 5(-1)1 { local j = `newnbrackets' - `i' + 1 local dip: word `j' of `list_taxperc' local dithr: word `j' of `list_taxthr' local diobs: word `j' of `list_svynobs' display as res _column(3) %20.5g `dip', _continue display as res %20.5gc `dithr', _continue display as res %19.5gc `diobs' } } else { forvalues i = 1/`newnbrackets' { local dip: word `i' of `list_taxperc' local dithr: word `i' of `list_taxthr' local diobs: word `i' of `list_svynobs' display as res _column(3) %20.5g `dip', _continue display as res %20.5gc `dithr', _continue display as res %19.5gc `diobs' } } display as text _column(4) "{hline 60}" display as text _column(4) "Number of brackets =", _continue display as res _column(55) %9.0f `newnbrackets' display as text _column(4) "{hline 60}" local regrouped = `nbrackets' - `newnbrackets' display as text _column(4) "`regrouped' bracket(s) grouped", _continue display as text "to avoid expanding weights more than `thetalimit' times per bracket." // Store population over merging point in survey (e.g. post-estimation) quietly use "`rawdata'", clear quietly sum `weight' if `income' > `mergingthr' local above_MP_svy = r(sum)/`popsize' // Use the new brackets for the survey data tempvar bracket_index_svy quietly egen `bracket_index' = cut(`taxincome'), at(`list_taxthr') icodes quietly egen `bracket_index_svy' = cut(`income'), at(`list_taxthr') icodes // We need to deal separately with survey observations in the last bracket quietly replace `bracket_index' = `max_taxindex' if (`taxincome' >= `max_taxthr') quietly replace `bracket_index_svy' = `max_taxindex' if (`income' >= `max_taxthr') quietly save "`rawdata'", replace // ---------------------------------------------------------------------- // // Import data for calibration on income composition, if any // ---------------------------------------------------------------------- // local usecomp 0 if ("`incomecomposition'" != "") { // Identify the format of the tax data if (substr("`incomecomposition'", -4, .) == ".xls" | substr("`incomecomposition'", -5, .) == ".xlsx") { // Excel format quietly import excel using "`incomecomposition'", firstrow clear } else if (substr("`incomecomposition'", -4, .) == ".csv") { // CSV format quietly import delimited using "`incomecomposition'", varnames(1) clear } else { // Stata format quietly use "`incomecomposition'", clear } capture confirm variable thr if (_rc) { display as error "variable 'thr' missing from file `incomecomposition'" exit 111 } // Only keep the tax data above the merging point quietly keep if (thr >= `mergingthr') quietly count if (r(N) == 0) { display as error "no income composition data above merging point; ignored" } else { // Store the thresholds and shares to calibrate on tempname compthr mkmat thr, matrix(`compthr') quietly ds thr, not local compvars = r(varlist) if ("`compvars'" == "") { display as error "covariates missing from file `incomecomposition'" exit 111 } tempname compshares mkmat `compvars', matrix(`compshares') local usecomp 1 } } // ---------------------------------------------------------------------- // // Import data for calibration on population composition by income // ---------------------------------------------------------------------- // local usepop 0 if ("`incomepopulation'" != "") { // Identify the format of the tax data if (substr("`incomepopulation'", -4, .) == ".xls" | substr("`incomepopulation'", -5, .) == ".xlsx") { // Excel format quietly import excel using "`incomepopulation'", firstrow clear } else if (substr("`incomepopulation'", -4, .) == ".csv") { // CSV format quietly import delimited using "`incomepopulation'", varnames(1) clear } else { // Stata format quietly use "`incomepopulation'", clear } capture confirm variable thr if (_rc) { display as error "variable 'thr' missing from file `incomepopulation'" exit 111 } capture confirm variable p if (_rc) { display as error "variable 'p' missing from file `incomepopulation'" exit 111 } // Only keep the tax data above the merging point quietly keep if (thr >= `mergingthr') quietly count if (r(N) == 0) { display as error "no population composition data above merging point; ignored" } else { // Store the thresholds and shares to calibrate on tempname popthr popperc mkmat thr, matrix(`popthr') mkmat p, matrix(`popperc') quietly ds thr p, not local popvars = r(varlist) if ("`popvars'" == "") { display as error "covariates missing from file `incomepopulation'" exit 111 } tempname popfreq mkmat `popvars', matrix(`popfreq') local usepop 1 } } // ---------------------------------------------------------------------- // // Perform linear calibration // ---------------------------------------------------------------------- // display as text "" display as text "Calibration" use "`rawdata'", clear // Create dummy variables for margins tempvar constant generate `constant' = 1 // We always have at least this margin for the sum of weights local dummies_margins `constant' if ("`holdmargins'" != "") { foreach v of varlist `holdmargins' { tempvar dummies_`v' quietly tabulate `v', generate(`dummies_`v'') local nvals_`v' = r(r) if (r(r) > 20) { display as error "warning: variable `v' has more than 20 different values,", _continue display as error "you should probably regroup them into fewer categories" } quietly levelsof `v', local(values_`v') unab new_dummies: `dummies_`v''* foreach u of varlist `new_dummies' { quietly replace `u' = 0 if missing(`u') } local dummies_margins `dummies_margins' `new_dummies' local values_margins `values_margins' `values_`v'' local nvals_margins `nvals_margins' `nvals_`v'' } } // Get value of margins tempvar margins_target quietly total `dummies_margins' [pw=`weight'] matrix define `margins_target' = e(b) // Add new margins to impose if ("`varmargins'" != "") { foreach v of varlist `varmargins' { tempvar newmargin`v' quietly generate `newmargin`v'' = cond(`v' == 1, 1, 0) local dummies_margins `dummies_margins' `newmargin`v'' local values_margins `values_margins' 1 local nvals_margins `nvals_margins' 1 } foreach a of numlist `freqmargins' { local b = `ind_popsize'*`a' matrix define `margins_target' = (`margins_target', `b') } } // Store bracket size margin values separately quietly save "`rawdata'", replace quietly merge n:1 `bracket_index' using "`taxdata'", nogenerate keep(master match) keepusing(`bracketsize') keep `bracket_index' `bracketsize' quietly drop if missing(`bracket_index') quietly duplicates drop sort `bracket_index' tempname margins_income mkmat `bracketsize', matrix(`margins_income') quietly use "`rawdata'", clear // Create dummy variables for survey income tax bracket tempvar dummies_income_svy quietly tabulate `bracket_index_svy', generate(`dummies_income_svy') unab dummies_income_svy: `dummies_income_svy'* foreach v of varlist `dummies_income_svy' { quietly replace `v' = 0 if missing(`v') } local dummies_margins_svy `dummies_margins' `dummies_income_svy' // Create dummy variables for income tax bracket tempvar dummies_income quietly tabulate `bracket_index', generate(`dummies_income') unab dummies_income: `dummies_income'* foreach v of varlist `dummies_income' { quietly replace `v' = 0 if missing(`v') } local dummies_margins `dummies_margins' `dummies_income' // Add margins from the tax data to the margins matrix define `margins_target' = (`margins_target', (`margins_income')') local values_margins `values_margins' `list_taxthr' local margins `holdmargins' `varmargins' `income' local nb_dummies_income: list sizeof dummies_income local nvals_margins `nvals_margins' `nb_dummies_income' // Create variables for income composition if (`usecomp') { local n = rowsof(`compshares') foreach v in `compvars' { capture confirm variable `v' if (_rc) { display as error "income composition variable `v' not found in the survey; ignored" continue } local j = colnumb(`compshares', "`v'") forvalue i = 1/`n' { local s = `compshares'[`i', `j'] if (`i' < `n') { tempvar share`v'`i' quietly generate `share`v'`i'' = (`taxincome' >= `compthr'[`i', 1] & `taxincome' < `compthr'[`i' + 1, 1])*(`v' - `s'*`taxincome')/`compthr'[`i', 1] quietly replace `share`v'`i'' = 0 if missing(`v') } else { tempvar share`v'`i' quietly generate `share`v'`i'' = (`taxincome' >= `compthr'[`i', 1])*(`v' - `s'*`taxincome')/`compthr'[`i', 1] quietly replace `share`v'`i'' = 0 if missing(`v') } local var_margins_comp `var_margins_comp' `share`v'`i'' matrix define `margins_target' = (`margins_target', 0) local val = `compthr'[`i', 1] local values_margins `values_margins' `val' } local nvars = `nvars' + 1 local margins `margins' "__comp_`v'" local nvals_margins `nvals_margins' `n' } } // Create variables for population composition by income if (`usepop') { local n = rowsof(`popfreq') foreach v in `popvars' { capture confirm variable `v' if (_rc) { display as error "population composition variable `v' not found in the survey; ignored" continue } capture assert inlist(`v', 0, 1) | missing(`v') if (_rc) { display as error "population composition variable `v' is not a dummy; ignored" } local j = colnumb(`popfreq', "`v'") forvalue i = 1/`n' { local s = `popfreq'[`i', `j'] if (`i' < `n') { tempvar pop`v'`i' quietly generate `pop`v'`i'' = (`taxincome' >= `popthr'[`i', 1] & `taxincome' < `popthr'[`i' + 1, 1])*`v' quietly replace `pop`v'`i'' = 0 if missing(`v') matrix define `margins_target' = (`margins_target', `popsize'*`s'*(`popperc'[`i' + 1, 1] - `popperc'[`i', 1])) } else { tempvar pop`v'`i' quietly generate `pop`v'`i'' = (`taxincome' >= `popthr'[`i', 1])*`v' quietly replace `pop`v'`i'' = 0 if missing(`v') matrix define `margins_target' = (`margins_target', `popsize'*`s'*(1 - `popperc'[`i', 1])) } local var_margins_comp `var_margins_comp' `pop`v'`i'' local val = `popthr'[`i', 1] local values_margins `values_margins' `val' } local nvars = `nvars' + 1 local margins `margins' "__pop_`v'" local nvals_margins `nvals_margins' `n' } } // Collapse by household to enforce equal weights within household tempfile data_nocollapse quietly save "`data_nocollapse'" if ("`var_margins_comp'" == "") { collapse (mean) `weight' (sum) `dummies_margins' (sum) `dummies_income_svy', by(`households') } else { collapse (mean) `weight' (sum) `dummies_margins' (sum) `dummies_income_svy' (sum) `var_margins_comp', by(`households') } // If the unit is households, then the income dummies must refer to that unit. // Either the collpased income dummy is zero, meaning that the household isn't in the bracket // therefore we leave it, or it is equal to the household size (>= 1) and we set it to one if ("`taxunit'" == "h") { foreach v of varlist `dummies_income' { quietly replace `v' = 1 if (`v' >= 1) } foreach v of varlist `dummies_income_svy' { quietly replace `v' = 1 if (`v' >= 1) } } local dummies_margins `dummies_margins' `var_margins_comp' local dummies_margins_svy `dummies_margins_svy' `var_margins_comp' // Get corresponding view matrices mata: X = 0 mata: Z = 0 mata: d = 0 mata: st_view(Z, ., st_local("dummies_margins")) mata: st_view(X, ., st_local("dummies_margins_svy")) mata: st_view(d, ., st_local("weight")) // Get the target value of the margins mata: M = st_matrix(st_local("margins_target"))' // Get calibration results tempname coefs mata: results = calibrate(d, X, Z, M) if (`success' == 1) { display as text "convergence achieved" } else { display as error "convergence not achieved" exit 430 } tempvar calweight calfactor quietly generate `calweight' = . mata: st_store(., st_local("calweight"), results) quietly generate `calfactor' = `calweight'/`weight' // Save the new weights tempfile new_weights quietly save "`new_weights'" // Re-import non-collapsed dataset quietly merge 1:n `households' using "`data_nocollapse'", nogenerate assert(match) // ---------------------------------------------------------------------- // // Store calibration factors // ---------------------------------------------------------------------- // tempname adjfactors rfactors local i = 1 local nvars: list sizeof margins forvalues j = 1/`nvars' { local var: word `j' of `margins' local nvals: word `j' of `nvals_margins' forvalues k = 1/`nvals' { local val: word `i' of `values_margins' if ("`var'" == "`income'") { local index: word `k' of `list_bracketindex' quietly summarize `calfactor' if (`bracket_index' == `index'), de matrix `rfactors' = (`coefs'[`i' + 1, 1], r(mean), r(min), r(p50), r(max)) matrix rownames `rfactors' = "`var':`val'" matrix define `adjfactors' = (nullmat(`adjfactors') \ `rfactors') } else if (substr("`var'", 1, 7) == "__comp_") { local var2 = substr("`var'", 8, .) if (`k' < `nvals') { quietly summarize `calfactor' if (`taxincome' >= `compthr'[`k', 1]) & (`taxincome' < `compthr'[`k' + 1, 1] & `var2' != 0), de } else { quietly summarize `calfactor' if (`taxincome' >= `compthr'[`k', 1] & `var2' != 0), de } matrix `rfactors' = (`coefs'[`i' + 1, 1], r(mean), r(min), r(p50), r(max)) matrix rownames `rfactors' = "`taxincome'#`var2':`val'" matrix define `adjfactors' = (nullmat(`adjfactors') \ `rfactors') } else if (substr("`var'", 1, 6) == "__pop_") { local var2 = substr("`var'", 7, .) if (`k' < `nvals') { quietly summarize `calfactor' if (`taxincome' >= `popthr'[`k', 1]) & (`taxincome' < `popthr'[`k' + 1, 1] & `var2' == 1), de } else { quietly summarize `calfactor' if (`taxincome' >= `popthr'[`k', 1] & `var2' == 1), de } matrix `rfactors' = (`coefs'[`i' + 1, 1], r(mean), r(min), r(p50), r(max)) matrix rownames `rfactors' = "`taxincome'#`var2':`val'" matrix define `adjfactors' = (nullmat(`adjfactors') \ `rfactors') } else { quietly summarize `calfactor' if (`var' == `val'), de matrix `rfactors' = (`coefs'[`i' + 1, 1], r(mean), r(min), r(p50), r(max)) matrix rownames `rfactors' = "`var':`val'" matrix define `adjfactors' = (nullmat(`adjfactors') \ `rfactors') } local i = `i' + 1 } } quietly summarize `calfactor', de matrix `rfactors' = (`coefs'[1, 1], r(mean), r(min), r(p50), r(max)) matrix rownames `rfactors' = "_cons" matrix define `adjfactors' = (nullmat(`adjfactors') \ `rfactors') matrix colnames `adjfactors' = "coef" "mean" "min" "median" "max" // ---------------------------------------------------------------------- // // Perform replacing & imputing // ---------------------------------------------------------------------- // if ("`noreplace'" == "") { display as text "" display as text "Replacing", _continue // Identify the point where replacing starts: it can be above the // merging if extrapolation was involved local replacepoint = max(`mergingpoint', `truststart') quietly use "`taxdata'", clear tempvar replace_region generate `replace_region' = (`taxperc' >= `replacepoint') quietly summarize `taxthr' if `replace_region', meanonly local replacethr = r(min) quietly summarize `taxperc' if `replace_region', meanonly local replacepoint = r(min) display as text "for p >= " round(`replacepoint', 0.001) ", `taxincome' >= " round(`replacethr') // Truncate sampletop to make sure there's not more observations in the // survey than in the real population if (`sampletop' > 1) { local sampletop = 1 } // Store the full tax data in Mata use "`rawtaxdata'", clear quietly keep if (`taxperc' >= `replacepoint') mata: p = st_data(., st_local("taxperc")) mata: thr = st_data(., st_local("taxthr")) mata: avg = st_data(., st_local("taxavg")) if ("`taxunit'" == "i") { // Import the original data quietly use "`rawdata'", clear // Add new weights quietly merge n:1 `households' using "`new_weights'", keepusing(`calweight') nogenerate // Define a numeric household id tempvar hid egen `hid' = group(`households') // Define a numeric personal id tempvar pid generate `pid' = _n // Separate households with some individuals above merging point (to be replaced), // and all individuals below the merging point (to preserve) tempfile hh_all hh_below hh_above sort `pid' `hid' quietly save "`hh_all'" tempvar nhhmem_above quietly egen `nhhmem_above' = total(`taxincome' >= `replacethr'), by(`hid') quietly keep if (`nhhmem_above' == 0) quietly drop `nhhmem_above' quietly save "`hh_below'" quietly use "`hh_all'" quietly egen `nhhmem_above' = total(`taxincome' >= `replacethr'), by(`hid') quietly keep if (`nhhmem_above' > 0) quietly drop `nhhmem_above' quietly save "`hh_above'" quietly count local n_before = r(N) // Break down observations into small ones keep `hid' `calweight' quietly duplicates drop `hid', force tempvar nexpand quietly generate `nexpand' = `sampletop'*`calweight' // Pseudo-random rounding to preserve sum using a halton sequence tempvar halton_unif quietly generate `halton_unif' = . mata: st_store(., st_varindex(st_local("halton_unif")), halton(st_nobs(), 1)) quietly replace `nexpand' = floor(`nexpand') + (`halton_unif' < (`nexpand' - floor(`nexpand'))) drop `halton_unif' // Make sure that all observations are used at least once (no loss of information) quietly replace `nexpand' = 1 if (`nexpand' < 1) // Adjust weights based on the number of observations to expand quietly replace `calweight' = `calweight'/`nexpand' quietly expand `nexpand' drop `nexpand' tempvar new_hid quietly generate `new_hid' = _n // Re-merge original households to get all the covariates tempvar _merge joinby `hid' using "`hh_above'", _merge(`_merge') drop `_merge' quietly count display as text "`n_before' => " r(N) " observations (×" round(r(N)/`n_before', 0.01) ")" // Scramble people's ranks above replace point preserve quietly keep if (`taxincome' >= `replacethr') gsort -`taxincome' tempvar idx quietly generate `idx' = _n tempvar idxstart idxend collapse (min) `idxstart'=`idx' (max) `idxend'=`idx', by(`hid' `pid') sort `idxstart' tempvar idxmin idxmax quietly generate `idxmin' = `idxstart'[max(1, _n - `knn')] quietly generate `idxmax' = `idxend'[min(_N, _n + `knn')] drop `idxstart' `idxend' tempfile list_idx quietly save "`list_idx'", replace restore quietly merge n:1 `hid' `pid' using "`list_idx'", nogenerate gsort -`taxincome' quietly replace `idxmin' = _n if missing(`idxmin') quietly replace `idxmax' = _n if missing(`idxmax') tempvar halton_unif quietly generate `halton_unif' = . mata: st_store(., st_varindex(st_local("halton_unif")), halton(st_nobs(), 1)) tempvar idx quietly generate `idx' = `idxmin' + (`idxmax' - `idxmin')*`halton_unif' sort `idx' quietly drop `halton_unif' `idx' `idxmin' `idxmax' // Determine a new rank in the individual distribution for people above the threshold gsort -`taxincome' tempvar to_replace rank quietly generate `to_replace' = (`taxincome' >= `replacethr') quietly generate double `rank' = 1 - sum(`calweight')/`ind_popsize' if `to_replace' // Attribute an income to each observation using the optimal transport map tempvar new_income quietly generate `new_income' = . mata: s = . mata: st_view(s, ., st_local("rank"), st_local("to_replace")) mata: st_store(., st_varindex(st_local("new_income")), st_local("to_replace"), ot_income(p, thr, avg, s)) // Compute a adjustment factors tempvar coef quietly generate `coef' = `new_income'/`taxincome' quietly replace `coef' = 1 if missing(`coef') // Add other households tempvar below quietly generate `below' = 0 append using "`hh_below'" quietly replace `below' = 1 if missing(`below') quietly replace `coef' = 1 if `below' // Generate variables quietly generate _correction = cond(`below', 1, 2) label define _correction 1 "reweighted" 2 "replaced" label values _correction _correction quietly generate _weight = `calweight' quietly generate _factor = `coef' quietly replace `taxincome' = _factor*`taxincome' if ("`taxincome'" != "`income'") { quietly replace `income' = _factor*`income' } quietly replace `weight' = . if _correction == 2 // Generate the new household and personal IDs sort `taxincome' quietly egen _hid = group(`hid' `new_hid'), missing gsort _hid `pid' quietly by _hid: generate _pid = _n } else { // Import the original data quietly use "`rawdata'", clear // Define a numeric household id tempvar hid egen `hid' = group(`households') sort `taxincome' // Add new weights quietly merge n:1 `households' using "`new_weights'", keepusing(`calweight') nogenerate // Separate households with above merging point (to be replaced), // and all individuals below the merging point (to preserve) tempfile hh_all hh_above hh_below quietly save "`hh_all'" quietly drop if (`taxincome' > `replacethr') quietly save "`hh_below'" quietly use "`hh_all'" quietly drop if (`taxincome' <= `replacethr') quietly save "`hh_above'" quietly count local n_before = r(N) // Break down observations into small ones keep `hid' `calweight' `taxincome' quietly duplicates drop `hid', force tempvar nexpand quietly generate `nexpand' = `sampletop'*`calweight' // Pseudo-random rounding to preserve sum using a halton sequence tempvar halton_unif quietly generate `halton_unif' = . mata: st_store(., st_varindex(st_local("halton_unif")), halton(st_nobs(), 1)) quietly replace `nexpand' = floor(`nexpand') + (`halton_unif' < (`nexpand' - floor(`nexpand'))) drop `halton_unif' // Make sure that all observations are used at least once (no loss of information) quietly replace `nexpand' = 1 if (`nexpand' < 1) // Adjust weights based on the number of observations to expand quietly replace `calweight' = `calweight'/`nexpand' quietly expand `nexpand' drop `nexpand' tempvar new_hid quietly generate `new_hid' = _n quietly count display as text "`n_before' => " r(N) " observations (×" round(r(N)/`n_before', 0.01) ")" // Scramble households' ranks preserve gsort -`taxincome' tempvar idx quietly generate `idx' = _n tempvar idxstart idxend collapse (min) `idxstart'=`idx' (max) `idxend'=`idx', by(`hid') sort `idxstart' tempvar idxmin idxmax quietly generate `idxmin' = `idxstart'[max(1, _n - `knn')] quietly generate `idxmax' = `idxend'[min(_N, _n + `knn')] drop `idxstart' `idxend' tempfile list_idx quietly save "`list_idx'", replace restore quietly merge n:1 `hid' `pid' using "`list_idx'", nogenerate gsort -`taxincome' quietly replace `idxmin' = _n if missing(`idxmin') quietly replace `idxmax' = _n if missing(`idxmax') tempvar halton_unif quietly generate `halton_unif' = . mata: st_store(., st_varindex(st_local("halton_unif")), halton(st_nobs(), 1)) tempvar idx quietly generate `idx' = `idxmin' + (`idxmax' - `idxmin')*`halton_unif' sort `idx' quietly drop `halton_unif' `idx' `idxmin' `idxmax' // Determine a new rank in the individual distribution for people above the threshold tempvar rank quietly generate double `rank' = 1 - sum(`calweight')/`hh_popsize' // Attribute an income to each observation using the optimal transport map tempvar new_income quietly generate `new_income' = . mata: s = . mata: st_view(s, ., st_local("rank")) mata: st_store(., st_varindex(st_local("new_income")), ot_income(p, thr, avg, s)) // Merge full households with covariates tempvar _merge joinby `hid' using "`hh_above'", _merge(`_merge') drop `_merge' // Compute a adjustment factors tempvar coef quietly generate `coef' = `new_income'/`taxincome' quietly replace `coef' = 1 if missing(`coef') // Add other households tempvar below quietly generate `below' = 0 append using "`hh_below'" quietly replace `below' = 1 if missing(`below') quietly replace `coef' = 1 if `below' // Generate variables quietly generate _correction = cond(`below', 1, 2) label define _correction 1 "reweighted" 2 "replaced" label values _correction _correction quietly generate _weight = `calweight' quietly generate _factor = `coef' quietly replace `taxincome' = _factor*`taxincome' if ("`taxincome'" != "`income'") { quietly replace `income' = _factor*`income' } quietly replace `weight' = . if _correction == 2 // Generate the new household and personal IDs sort `taxincome' quietly egen _hid = group(`hid' `new_hid'), missing gsort _hid `pid' quietly by _hid: generate _pid = _n /* // Add other households tempvar below quietly generate `below' = 0 append using "`hh_below'" quietly replace `below' = 1 if missing(`below') quietly replace `new_income' = `taxincome' if missing(`new_income') // Generate variables quietly generate _correction = cond(`below', 1, 2) label define _correction 1 "reweighted" 2 "replaced" label values _correction _correction quietly generate _weight = `calweight' quietly generate _factor = cond(`new_income' == `taxincome', 1, `new_income'/`taxincome') quietly replace `taxincome' = `new_income' quietly replace `weight' = . if _correction == 2 // Generate the new household and personal IDs sort `taxincome' quietly egen _hid = group(`hid' `new_hid'), missing gsort _hid `pid' quietly by _hid: generate _pid = _n */ } } else { use "`rawdata'", clear // Add new weights quietly merge n:1 `households' using "`new_weights'", keepusing(`calweight') nogenerate quietly generate _correction = 1 label define _correction 1 "reweighted" 2 "replaced" label values _correction _correction quietly generate _weight = `calweight' quietly generate _factor = 1 quietly egen _hid = group(`households') gsort _hid `pid' quietly by _hid: generate _pid = _n } label variable _correction "type of correction applied" label variable _weight "corrected survey weight" label variable _factor "income adjustment factor" label variable _hid "household ID" label variable _pid "personal ID" display as text "" display as text "The following variables were created (see {help bfmcorr} for details):" display as text _col(4) "{bf:_correction} type of correction applied" display as text _col(4) "{bf:_weight} corrected survey weight" display as text _col(4) "{bf:_factor} income adjustment factor" display as text _col(4) "{bf:_hid} household ID" display as text _col(4) "{bf:_pid} personal ID" display as text "" display as text "See {help postbfm} for postestimation commands." // ---------------------------------------------------------------------- // // Return some data, mostly for postestimation commands // ---------------------------------------------------------------------- // // Define locals with structure of missing population quietly sum _weight if `income'>`y_max_old' local unobs_pop=r(sum)/`popsize' local above_MP_tax=1-`mergingpoint' local corrected_pop=`above_MP_tax'-`above_MP_svy' local unobs_sh=`unobs_pop'/(abs(`corrected_pop')) local other_pop=`corrected_pop' - `unobs_pop' local other_sh=(`other_pop'/`corrected_pop') // Scalars ereturn scalar truststart = `truststart' ereturn scalar mergingpoint = `mergingpoint' ereturn scalar y_max_old=`y_max_old' ereturn scalar above_MP_svy=`above_MP_svy' ereturn scalar unobs_pop=`unobs_pop' ereturn scalar above_MP_tax=`above_MP_tax' ereturn scalar corrected_pop=`corrected_pop' ereturn scalar unobs_sh=`unobs_sh' ereturn scalar other_sh=`other_sh' if ("`sigma'" != "") { ereturn scalar var_res=`sigma' } // Macros ereturn local income_var "`income'" // Matrices if ("`theta'" != "") { ereturn matrix theta = `theta' } if ("`beta_ridge'" != "") { ereturn matrix beta_ridge = `beta_ridge' } ereturn matrix mat_lorenz_old = `mat_lorenz_old' ereturn matrix mat_sum_old = `mat_sum_old' ereturn matrix adj_factors = `adjfactors' end // -------------------------------------------------------------------------- // // Mata code performing the main matrix operations // -------------------------------------------------------------------------- // mata: mata set matastrict on // Perform one-dimensional ridge regression real vector ridge_regression(real vector y, real vector x, real vector w, real scalar beta, real scalar lambda) { real matrix Q, X, b0 real scalar n // Number of observations n = length(y) // Normalize weights w = w/quadsum(w)*n // Penalization matrix Q = (0, 0 \ 0, lambda) // Prior values for the parameter b0 = (0 \ beta) // Add an intervept to idenpendent variables X = (J(n, 1, 1), x) return(qrsolve(quadcross(X, w, X) + Q, quadcross(X, w, y) + Q*b0)) } // Perform isotonic regression using Pool Adjacent Violators Algorithm // (see http://stat.wikia.com/wiki/Isotonic_regression) real vector isotonic_pava(real vector w, real vector a, real scalar reverse) { real vector y, a_prime, w_prime, S real scalar n, j if (reverse) { a = -a } n = length(a) y = J(n, 1, .) j = 1 S = J(n, 1, .) S[1] = 0 S[2] = 1 a_prime = J(n, 1, .) a_prime[1] = a[1] w_prime = J(n, 1, .) w_prime[1] = w[1] for (i = 2; i <= n; i++) { j = j + 1 a_prime[j] = a[i] w_prime[j] = w[i] while (1) { if (j <= 1) { break } if (a_prime[j] >= a_prime[j - 1]) { break } a_prime[j - 1] = (w_prime[j]*a_prime[j] + w_prime[j - 1]*a_prime[j - 1])/(w_prime[j] + w_prime[j - 1]) w_prime[j - 1] = w_prime[j] + w_prime[j - 1] j = j - 1 } S[j + 1] = i } for (k = 1; k <= j; k++) { for (l = S[k] + 1; l <= S[k + 1]; l++) { y[l] = a_prime[k] } } if (reverse) { y = -y } return(y) } // Give the result of a linear calibration problem with the constraint that // weights are greater than one real vector calibrate(real vector d, real matrix X, real matrix Z, real vector M) { real matrix inv_sigma, X_free, X_cns, Z_free, Z_cns, d_free real vector w, free, m, M_cns, below_one, too_high, too_low real scalar rank, N, i, thetalimit thetalimit = strtoreal(st_local("thetalimit")) // All weights are intially free. We will progressively constrain those that // fall below one or when the calibration exceed the limits w = d free = J(rows(d), 1, 1) i = 1 while (1) { display("{text:iteration " + strofreal(i) + ": " + strofreal(sum(!free)) + " weight(s) constrained}") st_select(X_free, X, free) st_select(X_cns, X, !free) st_select(Z_free, Z, free) st_select(Z_cns, Z, !free) st_select(d_free, d, free) // Remove constrained observations from the margins M_cns = M :- quadcolsum(Z_cns)' // Calculate population size N = quadsum(d_free) // Calculate survey margins m = quadcolsum(Z_free :* d_free)' // Calculate the new weights H = svsolve(quadcross(Z_free, d_free, X_free), M_cns - m, rank, 1e-16) // Only update free weights w[selectindex(free)] = d_free :* (1 :+ (X_free * H)) // Check if there are invalid weights below_one = (w :<= 1) too_low = (w:/d :< 1/thetalimit) too_high = (w:/d :> thetalimit) if (sum((below_one :| too_low :| too_high) :& free) == 0) { // Algorithm converged st_matrix(st_local("coefs"), H) st_local("success", "1") return(w) } if (sum(below_one) >= 0.5*rows(w)) { // Algorithm did not converge st_local("success", "0") return(w) } // Constrain all the weights that violate the constraints free[selectindex(too_low :| below_one)] = J(sum(too_low :| below_one), 1, 0) w[selectindex(too_low :| below_one)] = rowmax(( J(sum(too_low :| below_one), 1, 1), d[selectindex(too_low :| below_one)]/thetalimit )) free[selectindex(too_high)] = J(sum(too_high), 1, 0) w[selectindex(too_high)] = d[selectindex(too_high)]*thetalimit i = i + 1 } } real vector ot_income(real vector p, real vector thr, real vector avg, real vector s) { // Size of the tabulation k = length(p) // Number of observations to attribute income to n = length(s) y = J(n, 1, .) // Parameters of the Pareto distribution at the top mu = thr[k] beta = avg[k]/thr[k] alpha = beta/(beta - 1) // Parameters of the uniform distributions (mean-split histogram) a = J(2*(k - 1), 1, .) b = J(2*(k - 1), 1, .) q = J(2*(k - 1), 1, .) for (i = 1; i < k; i++) { a[2*i - 1, 1] = thr[i] b[2*i - 1, 1] = avg[i] q[2*i - 1, 1] = (p[i + 1] - p[i])*(thr[i + 1] - avg[i])/(thr[i + 1] - thr[i]) a[2*i, 1] = avg[i] b[2*i, 1] = thr[i + 1] q[2*i, 1] = (p[i + 1] - p[i])*(avg[i] - thr[i])/(thr[i + 1] - thr[i]) } q = (p[1] \ p[1] :+ runningsum(q))[1..(2*(k - 1))] // We start from the top, where we assume a Pareto distribution p0 = p[k] y[1] = thr[k]*beta*((1 - s[1])/(1 - p[k]))^(-1/alpha) i = 2 while (s[i] > p[k]) { p1 = s[i - 1] p2 = s[i] y[i] = ((-(alpha*(-1 + p1)*(1 - p2)^(1/alpha)) + alpha*(1 - p1)^(1/alpha)*(-1 + p2))* ((1 - p0)/((-1 + p1)*(-1 + p2)))^(1/alpha)*thr[k])/((-1 + alpha)*(-p1 + p2)) i = i + 1 } // Observation covering both brackets p1 = s[i - 1] p2 = p[k] y1 = ((-(alpha*(-1 + p1)*(1 - p2)^(1/alpha)) + alpha*(1 - p1)^(1/alpha)*(-1 + p2))* ((1 - p0)/((-1 + p1)*(-1 + p2)))^(1/alpha)*thr[k])/((-1 + alpha)*(-p1 + p2)) p0 = q[2*(k - 1)] p1 = p[k] p2 = s[i] p3 = p[k] y2 = (b[2*(k - 1)]*(2*p0 - p1 - p2) + a[2*(k - 1)]*(p1 + p2 - 2*p3))/(2*(p0 - p3)) y[i] = y1*(p[k] - s[i - 1])/(s[i] - s[i - 1]) + y2*(s[i] - p[k])/(s[i] - s[i - 1]) i = i + 1 // Next observations, covered by the mean-split histogram l = 2*(k - 1) while (i <= n) { if (s[i] <= p0 & l > 1) { l = l - 1 p0 = q[l] p3 = q[l + 1] } p1 = s[i] p2 = s[i - 1] y[i] = (b[l]*(2*p0 - p1 - p2) + a[l]*(p1 + p2 - 2*p3))/(2*(p0 - p3)) i = i + 1 } return(y) } end