*! _diddesign_prep.ado - Data preparation for DID estimation
*!
*! Prepares panel data or repeated cross-sectional data for difference-in-
*! differences estimation. Implements time normalization, group indicator
*! construction, and lagged outcome transformation for the double DID estimator.

version 16.0

program define _diddesign_prep, rclass

    // =========================================================================
    // _diddesign_prep
    //
    // Purpose
    //   Validates the estimation sample, builds the internal design variables
    //   (consecutive time index, standardized time index, group indicator Gi,
    //   post indicator It, lagged-group-mean outcome difference) and hands the
    //   prepared data to the Mata routine _diddesign_populate_data().
    //
    // Options
    //   outcome(varname)      outcome variable
    //   treatment(varname)    treatment indicator; must be 0/1 within 1e-6
    //   time(varname)         time variable
    //   id(varname)           unit identifier (required for panel data)
    //   post(varname)         post-period indicator (required for RCS data)
    //   cluster(varname)      cluster variable, passed through to Mata
    //   covariates(varlist)   covariates, passed through to Mata
    //   touse(varname)        sample marker; all rows are used if omitted
    //   idtimevar(name)       name for the time index      (default _did_id_time)
    //   idtimestdvar(name)    name for the centered index  (default _did_id_time_std)
    //   givar(name)           name for Gi                  (default _did_Gi)
    //   itvar(name)           name for It                  (default _did_It)
    //   deltavar(name)        name for the outcome delta   (default _did_outcome_delta)
    //   panel / rcs           data-type flags; if neither is given, panel is
    //                         assumed when id() is supplied. If both are given,
    //                         the RCS post() validation runs and the panel
    //                         preparation branch is the one executed.
    //
    // Side effects
    //   Drops (if present) and recreates the five internal variables named
    //   above in the data in memory.
    //
    // Returns (r-class)
    //   macros : r(id_time) r(id_time_std) r(Gi) r(It) r(outcome_delta)
    //   scalars: r(N) r(n_units) r(n_periods) r(treat_year) r(treat_year_std)
    //            r(is_panel) r(n_missing_delta)
    //
    // Exit codes
    //   198   invalid specification or data (E003 / E011 messages)
    //   459   duplicate unit-time cells, non-absorbing treatment, or
    //         staggered adoption (panel only)
    //   498   the Mata population routine returned a non-zero code
    //   2000  no observations in the sample
    // =========================================================================

    // =========================================================================
    // SECTION 1: INPUT PARAMETERS
    // =========================================================================

    syntax, ///
        OUTCOME(varname)        /// Outcome variable
        TREATment(varname)      /// Treatment indicator
        TIME(varname)           /// Time variable
        [ID(varname)]           /// Unit ID (panel only)
        [POST(varname)]         /// Post indicator (RCS only)
        [CLuster(varname)]      /// Cluster variable
        [COVariates(varlist)]   /// Covariates (already expanded by caller)
        [TOUSE(varname)]        /// Sample marker
        [IDTIMEVAR(name)]       /// Internal id_time variable name
        [IDTIMESTDVAR(name)]    /// Internal standardized time variable name
        [GIVAR(name)]           /// Internal Gi variable name
        [ITVAR(name)]           /// Internal It variable name
        [DELTAVAR(name)]        /// Internal outcome_delta variable name
        [PANEL]                 /// Panel data flag
        [RCS]                   // RCS data flag

    // Determine data type
    local is_panel = ("`panel'" != "")
    local is_rcs   = ("`rcs'" != "")

    // Default to panel if neither specified but id is provided
    if !`is_panel' & !`is_rcs' {
        if "`id'" != "" {
            local is_panel = 1
        }
        else {
            display as error "E003: Must specify panel or rcs data type"
            exit 198
        }
    }

    // Fail early with a clear message when the option each data type depends
    // on is absent; otherwise the empty macro expands into malformed commands
    // further down and produces a misleading error.
    if `is_panel' & "`id'" == "" {
        display as error "E003: id() is required for panel data"
        exit 198
    }
    if `is_rcs' & "`post'" == "" {
        display as error "E003: post() is required for repeated cross-sectional data"
        exit 198
    }

    // Default names for the internal variables
    if "`idtimevar'" == ""    local idtimevar    "_did_id_time"
    if "`idtimestdvar'" == "" local idtimestdvar "_did_id_time_std"
    if "`givar'" == ""        local givar        "_did_Gi"
    if "`itvar'" == ""        local itvar        "_did_It"
    if "`deltavar'" == ""     local deltavar     "_did_outcome_delta"

    // =========================================================================
    // SECTION 2: DATA VALIDATION
    // =========================================================================
    // Validates data structure requirements for DID estimation:
    //   - Binary treatment indicator (0/1)
    //   - Presence of both treatment and control observations
    //   - For RCS: binary post-treatment indicator

    // Initialize sample marker if not provided
    if "`touse'" == "" {
        tempvar touse
        quietly gen byte `touse' = 1
    }

    // Record the caller's row order for EVERY row (not only the estimation
    // sample) so the final sort restores the complete original ordering;
    // rows with a missing marker would otherwise tie and land in arbitrary
    // order after the internal bysort/merge steps.
    tempvar did_obs_order
    quietly gen long `did_obs_order' = _n

    // Count observations
    quietly count if `touse'
    local N = r(N)

    if `N' == 0 {
        display as error "E003: No observations in sample"
        exit 2000
    }

    // Tiny float outcomes are already quantized before Mata sees the data.
    // Reject them here so storage-type-dependent ATT / VCOV results do not
    // silently propagate through the standard DID pipeline.
    capture confirm float variable `outcome'
    if _rc == 0 {
        quietly summarize `outcome' if `touse' & !missing(`outcome'), meanonly
        if r(N) > 0 {
            local outcome_abs_max = max(abs(r(min)), abs(r(max)))
            // Moderate float outcomes remain stable at much larger scales, but
            // values below this cutoff are small enough to make ATT depend on
            // float versus double storage after repeated centering/differencing.
            local float_tiny_cutoff = 1e-10
            if `outcome_abs_max' > 0 & `outcome_abs_max' <= `float_tiny_cutoff' {
                local outcome_abs_max_txt : display %21.15g `outcome_abs_max'
                local float_tiny_cutoff_txt : display %21.15g `float_tiny_cutoff'
                display as error "E003: Outcome variable `outcome' is stored as float at an unsupported tiny scale"
                display as error " Recast `outcome' to double before estimation"
                display as error " max(abs(`outcome')) = `outcome_abs_max_txt' is <= `float_tiny_cutoff_txt'"
                exit 198
            }
        }
    }

    // Canonicalize treatment to exact 0/1 before downstream preparation.
    // Values outside the 1e-6 tolerance (including missing) stay missing in
    // the working copy and are reported as non-binary.
    tempvar treatment_work
    quietly gen double `treatment_work' = . if `touse'
    quietly replace `treatment_work' = 0 if abs(`treatment') < 1e-6 & `touse'
    quietly replace `treatment_work' = 1 if abs(`treatment' - 1) < 1e-6 & `touse'
    quietly count if missing(`treatment_work') & `touse'
    if r(N) > 0 {
        display as error "E003: Treatment variable must be binary (0/1)"
        display as error " Found `r(N)' observations outside the 1e-6 tolerance around 0/1"
        exit 198
    }

    // Check for treated observations (preliminary)
    // Definitive check is performed after Gi computation for panel data
    quietly count if `treatment_work' == 1 & `touse'
    if r(N) == 0 {
        display as error "E003: No treated units found in data"
        exit 198
    }

    // Check for control observations (preliminary)
    // Definitive check is performed after Gi computation for panel data
    quietly count if `treatment_work' == 0 & `touse'
    if r(N) == 0 {
        display as error "E003: No control units found in data"
        exit 198
    }

    // From here on `treatment' refers to the canonicalized working copy.
    local treatment "`treatment_work'"

    // For RCS: validate post indicator is binary
    if `is_rcs' {
        quietly summarize `post' if `touse'

        // Handle all-missing post variable
        if r(N) == 0 | missing(r(min)) | missing(r(max)) {
            display as error "E003: Post-treatment indicator contains only missing values"
            exit 198
        }

        // Canonicalize post to exact 0/1 with the same tolerance as treatment
        tempvar post_work
        quietly gen double `post_work' = . if `touse'
        quietly replace `post_work' = 0 if abs(`post') < 1e-6 & `touse'
        quietly replace `post_work' = 1 if abs(`post' - 1) < 1e-6 & `touse'
        quietly count if missing(`post_work') & `touse'
        if r(N) > 0 {
            display as error "E003: Post-treatment indicator must be binary (0/1)"
            display as error " Found `r(N)' observations outside the 1e-6 tolerance around 0/1"
            exit 198
        }

        // Ensure both pre- and post-treatment observations exist.
        // post_work is exactly 0 or 1 here, so min == max means one side is empty.
        quietly summarize `post_work' if `touse'
        local post_min_round = r(min)
        if r(min) == r(max) {
            if `post_min_round' == 0 {
                display as error "E003: Post-treatment indicator is all 0 (no post-treatment observations)"
            }
            else {
                display as error "E003: Post-treatment indicator is all 1 (no pre-treatment observations)"
            }
            exit 198
        }

        // From here on `post' refers to the canonicalized working copy.
        local post "`post_work'"
    }

    // =========================================================================
    // SECTION 3: PANEL DATA PREPARATION
    // =========================================================================
    // Prepares panel data by:
    //   1. Converting time variable to consecutive integers
    //   2. Identifying treatment time (first period where treatment == 1)
    //   3. Creating group indicator Gi (1 if unit ever treated)
    //   4. Creating post-treatment indicator It
    //   5. Creating standardized time index (centered at treatment time)

    if `is_panel' {
        // Panel DID requires each unit-time cell to appear at most once in
        // the estimation sample. Duplicate cells would otherwise distort the
        // prepared design matrix and bootstrap logic. Only in-sample rows are
        // counted, so a duplicate that lies outside the estimation sample
        // does not trigger the error.
        tempvar did_cell_n
        quietly egen long `did_cell_n' = total(`touse' != 0), by(`id' `time')
        quietly count if `did_cell_n' > 1 & `touse'
        if r(N) > 0 {
            display as error "E003: Panel data must be uniquely identified by id() and time()"
            display as error " Found duplicate unit-time observations in the estimation sample"
            exit 459
        }

        // -----------------------------------------------------------------
        // Step 1: Time index conversion
        // Convert time variable to consecutive integers for standardization
        // -----------------------------------------------------------------
        tempvar id_time_n
        quietly egen `id_time_n' = group(`time') if `touse'

        // -----------------------------------------------------------------
        // Step 2: Identify treatment time
        // Treatment time is the minimum time period where treatment == 1
        // -----------------------------------------------------------------
        quietly summarize `id_time_n' if `treatment' == 1 & `touse', meanonly
        if r(N) == 0 {
            display as error "E003: Cannot identify treatment time"
            exit 198
        }
        local treat_year = r(min)

        // Standard DID requires an absorbing treatment path within each unit.
        // Out-of-sample rows have a missing time index, sort last within the
        // unit, and therefore never act as the lag of an in-sample row.
        tempvar treat_bin treat_lag treat_diff
        quietly gen double `treat_bin' = round(`treatment', 1e-6) if `touse'
        quietly bysort `id' (`id_time_n'): gen double `treat_lag' = `treat_bin'[_n-1] if `touse'
        quietly gen double `treat_diff' = `treat_bin' - `treat_lag' if `touse' & `treat_lag' < .
        quietly count if `treat_diff' < 0 & `touse'
        if r(N) > 0 {
            display as error "E003: Treatment variable must be cumulative (absorbing)"
            display as error " Found `r(N)' observations with treatment decreasing over time"
            display as error " Standard DID requires treatment to remain 1 once it starts"
            exit 459
        }

        // Standard DID requires a common treatment adoption time for all
        // eventually treated units. Later cohorts must use design(sa).
        tempvar first_treat_obs first_treat_unit unit_treat_tag
        quietly gen double `first_treat_obs' = `id_time_n' if `treatment' == 1 & `touse'
        quietly egen double `first_treat_unit' = min(`first_treat_obs') if `touse', by(`id')
        quietly egen `unit_treat_tag' = tag(`id') if `touse' & `first_treat_unit' < .
        quietly levelsof `first_treat_unit' if `unit_treat_tag' == 1 & `touse', local(first_treat_levels)
        local n_treat_times : word count `first_treat_levels'
        if `n_treat_times' > 1 {
            display as error "E003: Staggered adoption detected under the standard DID design"
            display as error " Treated units do not share a common treatment adoption time"
            display as error " Re-run with design(sa) for staggered-adoption estimation"
            exit 459
        }

        // -----------------------------------------------------------------
        // Step 3: Create group indicator (Gi)
        // Gi = 1 if unit is ever treated, 0 otherwise.
        // Rounding ensures Gi is exactly 0 or 1 for downstream comparisons.
        // -----------------------------------------------------------------
        tempvar Gi_temp
        quietly egen `Gi_temp' = max(`treatment') if `touse', by(`id')
        quietly replace `Gi_temp' = round(`Gi_temp', 1) if `touse'

        // -----------------------------------------------------------------
        // Step 4: Create post-treatment indicator (It)
        // It = 1 if time period >= treatment time, 0 otherwise.
        // Stata treats missing as larger than any number, so rows with a
        // missing time index are excluded here and keep a missing It rather
        // than being classified as post-treatment.
        // -----------------------------------------------------------------
        tempvar It_temp
        quietly gen `It_temp' = (`id_time_n' >= `treat_year') if `touse' & !missing(`id_time_n')

        // Standard DID requires at least one pre-treatment period in the
        // estimation sample. Guard here so the failure does not leak into the
        // later lagged-mean merge step.
        quietly count if `It_temp' == 0 & `touse'
        if r(N) == 0 {
            display as error "E003: No pre-treatment observations in sample"
            display as error " Standard DID requires at least one pre-treatment period in the estimation sample"
            exit 198
        }

        // -----------------------------------------------------------------
        // Step 5: Create standardized time index
        // Centered at treatment time (treatment time = 0)
        // -----------------------------------------------------------------
        tempvar id_time_std_temp
        quietly gen `id_time_std_temp' = `id_time_n' - `treat_year' if `touse'

        // -----------------------------------------------------------------
        // Step 6: Validate control units exist
        // After Gi is computed, units with Gi = 0 must exist
        // -----------------------------------------------------------------
        quietly count if `Gi_temp' == 0 & `touse'
        if r(N) == 0 {
            display as error "E003: No control units found in data (all units eventually treated)"
            exit 198
        }

        // -----------------------------------------------------------------
        // Step 7: Count units and periods
        // -----------------------------------------------------------------
        tempvar unit_tag
        quietly egen `unit_tag' = tag(`id') if `touse'
        quietly count if `unit_tag' == 1 & `touse'
        local n_units = r(N)

        quietly summarize `id_time_n' if `touse'
        local n_periods = r(max) - r(min) + 1

        // Store in caller-provided internal variables
        capture drop `idtimevar'
        capture drop `idtimestdvar'
        capture drop `givar'
        capture drop `itvar'

        quietly gen `idtimevar' = `id_time_n' if `touse'
        quietly gen `idtimestdvar' = `id_time_std_temp' if `touse'
        quietly gen `givar' = `Gi_temp' if `touse'
        quietly gen `itvar' = `It_temp' if `touse'

        local id_var = "`id'"
    }
    else if `is_rcs' {
        // =====================================================================
        // SECTION 4: REPEATED CROSS-SECTIONAL DATA PREPARATION
        // =====================================================================
        // Prepares repeated cross-sectional (RCS) data by:
        //   1. Assigning Gi directly from treatment variable
        //   2. Assigning It directly from post indicator variable
        //   3. Converting time variable to consecutive integers
        //   4. Creating standardized time index (centered at treatment time)
        //
        // Key difference from panel: Gi and It are directly observed, not derived

        // -----------------------------------------------------------------
        // Step 1-2: Assign Gi and It directly
        // For RCS data, Gi (treatment group) and It (post period) are
        // directly observed rather than derived from panel structure.
        // -----------------------------------------------------------------
        tempvar Gi_temp It_temp
        quietly gen `Gi_temp' = `treatment' if `touse'
        quietly gen `It_temp' = `post' if `touse'
        quietly replace `It_temp' = round(`It_temp', 1) if `touse'

        // -----------------------------------------------------------------
        // Step 3: Time index conversion
        // Convert time variable to consecutive integers for standardization
        // -----------------------------------------------------------------
        tempvar id_time_n
        quietly egen `id_time_n' = group(`time') if `touse'

        // -----------------------------------------------------------------
        // Step 4: Identify treatment time
        // For RCS, treatment time is the minimum time period where It == 1
        // -----------------------------------------------------------------
        quietly summarize `id_time_n' if abs(`It_temp' - 1) < 1e-6 & `touse', meanonly
        if r(N) == 0 {
            display as error "E003: Cannot identify treatment time from post indicator"
            exit 198
        }
        local treat_year = r(min)

        // -----------------------------------------------------------------
        // Step 5: Create standardized time index
        // Centered at treatment time (treatment time = 0)
        // -----------------------------------------------------------------
        tempvar id_time_std_temp
        quietly gen `id_time_std_temp' = `id_time_n' - `treat_year' if `touse'

        // -----------------------------------------------------------------
        // Step 6: Count periods
        // Unit count is not applicable for RCS data
        // -----------------------------------------------------------------
        local n_units = .

        quietly summarize `id_time_n' if `touse'
        local n_periods = r(max) - r(min) + 1

        // Store in caller-provided internal variables
        capture drop `idtimevar'
        capture drop `idtimestdvar'
        capture drop `givar'
        capture drop `itvar'

        quietly gen `idtimevar' = `id_time_n' if `touse'
        quietly gen `idtimestdvar' = `id_time_std_temp' if `touse'
        quietly gen `givar' = `Gi_temp' if `touse'
        quietly gen `itvar' = `It_temp' if `touse'

        // No unit ID for RCS
        local id_var = ""
    }

    // =========================================================================
    // SECTION 5: LAGGED OUTCOME TRANSFORMATION
    // =========================================================================
    // Computes the lagged group mean transformation for the sequential DID
    // estimator, which is consistent under the parallel trends-in-trends
    // assumption:
    //
    //   ΔY_{it} = Y_{it} - Ȳ_{Gi,t-1}
    //
    // where Ȳ_{Gi,t-1} is the mean outcome for group Gi at standardized time t-1.
    // This transformation enables the double DID estimator to combine standard
    // DID and sequential DID via GMM for optimal efficiency.
    //
    // Algorithm:
    //   1. Compute mean outcome for each (Gi, id_time_std) combination
    //   2. Shift time index by +1 to create the lag structure
    //   3. Merge lagged means back to original data
    //   4. Compute outcome_delta = outcome - lagged_group_mean
    //
    // The earliest period has missing outcome_delta (no prior period to lag from)
    // =========================================================================

    // -----------------------------------------------------------------
    // Step 1: Compute group-period means
    // Mean outcome is computed for each (Gi, id_time_std) combination using
    // the observed sample analogue: missing outcomes drop out of the group
    // mean but do not invalidate the group-period lag mean for the other
    // observations in that cell.
    // -----------------------------------------------------------------

    tempvar did_ymean did_merge

    // preserve/restore brackets the construction of the collapsed dataset
    preserve

        // Keep only needed variables and sample
        quietly keep if `touse'
        quietly keep `outcome' `givar' `idtimestdvar'

        // Rows without a time index cannot define a lag. Dropping them keeps
        // a missing-key cell out of the lookup table, so such rows receive a
        // missing lagged mean instead of matching each other on the missing key.
        quietly drop if missing(`idtimestdvar')

        quietly collapse (mean) `did_ymean' = `outcome', by(`givar' `idtimestdvar')

        // -------------------------------------------------------------
        // Step 2: Time-shift by one period
        // Shifting id_time_std by +1 creates the lagged structure.
        // After merge, each observation at time t receives the mean from t-1.
        // -------------------------------------------------------------
        quietly replace `idtimestdvar' = `idtimestdvar' + 1

        // Save lagged means to tempfile
        tempfile lagged_means
        quietly save `lagged_means', replace

    restore

    // -----------------------------------------------------------------
    // Step 3: Merge lagged means
    // Left join preserves all observations. Unmatched observations
    // (earliest period) receive missing lagged mean.
    // -----------------------------------------------------------------
    quietly merge m:1 `givar' `idtimestdvar' using `lagged_means', ///
        keep(master match) generate(`did_merge')

    // Restore the caller's observation order so downstream routines that
    // preserve first-appearance semantics (for example bootstrap cluster
    // enumeration) remain invariant to the internal lagged-mean merge.
    quietly sort `did_obs_order'

    // -----------------------------------------------------------------
    // Validate merge results
    // -----------------------------------------------------------------
    // master only (no match)
    quietly count if `did_merge' == 1 & `touse'
    local n_nomatch = r(N)

    // matched
    quietly count if `did_merge' == 3 & `touse'
    local n_matched = r(N)

    if `n_matched' == 0 {
        display as error "E011: Merge failed - no observations matched lagged means"
        display as error " This may indicate data structure issues"
        quietly drop `did_merge'
        exit 198
    }

    // Warning for high non-match rate (expected for earliest period, but warn if excessive)
    quietly count if `touse'
    local n_total_merge = r(N)
    local pct_nomatch = 100 * `n_nomatch' / `n_total_merge'
    if `pct_nomatch' > 50 {
        display as text "Warning: `pct_nomatch'% of observations had no matching lagged mean"
    }

    quietly drop `did_merge'

    // -----------------------------------------------------------------
    // Step 4: Compute outcome delta
    // ΔY_{it} = Y_{it} - Ȳ_{Gi,t-1}
    // Missing lagged mean produces missing outcome_delta (earliest period).
    // -----------------------------------------------------------------
    capture drop `deltavar'
    quietly gen double `deltavar' = `outcome' - `did_ymean' if `touse'

    // Working variables are no longer needed once the delta exists
    capture drop `did_ymean'
    capture drop `did_obs_order'

    // -----------------------------------------------------------------
    // Step 5: Validate outcome delta
    // -----------------------------------------------------------------

    // Count missing outcome_delta
    quietly count if missing(`deltavar') & `touse'
    local n_missing_delta = r(N)

    // Count observations in earliest period (expected to have missing delta)
    quietly summarize `idtimestdvar' if `touse', meanonly
    if r(N) == 0 | missing(r(min)) {
        display as text "Warning: Cannot determine earliest time period (all id_time_std missing)"
        local min_time_std = .
        local n_earliest = 0
    }
    else {
        local min_time_std = r(min)
        quietly count if `idtimestdvar' == `min_time_std' & `touse'
        local n_earliest = r(N)
    }

    // Warning if more observations have missing delta than expected
    quietly count if `touse'
    local n_total = r(N)

    if `n_missing_delta' > `n_earliest' {
        local extra_missing = `n_missing_delta' - `n_earliest'
        display as text "Warning: `extra_missing' additional observations have missing outcome_delta"
        display as text " (beyond the `n_earliest' expected for earliest time period)"
        display as text " This may indicate missing outcome values in the data"
    }

    // General warning if high percentage missing.
    // For short panels/RCS designs (at most 3 periods), the earliest period
    // structurally lacks a lagged group mean, so when that baseline
    // missingness is the only missingness it is not reported as abnormal.
    local pct_missing = 100 * `n_missing_delta' / `n_total'
    if `pct_missing' > 30 & !(`n_periods' <= 3 & `n_missing_delta' == `n_earliest') {
        display as text "Warning: `pct_missing'% of observations have missing outcome_delta"
    }

    // =========================================================================
    // SECTION 6: MATA STRUCTURE POPULATION
    // =========================================================================
    // Transfer prepared data to Mata structure for estimation

    // Standardized treatment time is defined as zero
    local treat_year_std = 0

    // Call Mata function to populate structure; its return code is copied
    // into the local mata_rc. Note that `treatment' is the canonicalized
    // working copy at this point.
    mata: st_local("mata_rc", strofreal(_diddesign_populate_data( ///
        "`outcome'",        /* outcome variable name */ ///
        "`treatment'",      /* treatment variable name */ ///
        "`id_var'",         /* id variable name (or "") */ ///
        "`idtimevar'",      /* id_time variable name */ ///
        "`covariates'",     /* covariate variable names */ ///
        "`cluster'",        /* cluster variable name */ ///
        "`givar'",          /* Gi variable name */ ///
        "`itvar'",          /* It variable name */ ///
        "`idtimestdvar'",   /* id_time_std variable name */ ///
        "`deltavar'",       /* outcome_delta var name */ ///
        `N',                /* N observations */ ///
        `n_units',          /* n_units */ ///
        `n_periods',        /* n_periods */ ///
        `treat_year',       /* treat_year (calendar-normalized) */ ///
        `is_panel',         /* is_panel flag */ ///
        "`touse'"           /* touse variable name */ ///
    )))

    if `mata_rc' != 0 {
        display as error "Error populating Mata did_data structure"
        exit 498
    }

    // =========================================================================
    // SECTION 7: RETURN VALUES
    // =========================================================================
    // Return prepared variable names and computed scalars to caller

    // Variable names (created variables)
    return local id_time = "`idtimevar'"
    return local id_time_std = "`idtimestdvar'"
    return local Gi = "`givar'"
    return local It = "`itvar'"
    return local outcome_delta = "`deltavar'"

    // Scalar values
    return scalar N = `N'
    return scalar n_units = `n_units'
    return scalar n_periods = `n_periods'
    return scalar treat_year = `treat_year'
    return scalar treat_year_std = `treat_year_std'
    return scalar is_panel = `is_panel'
    return scalar n_missing_delta = `n_missing_delta'

end