*! version 2.0.1 27jul2013 Michael Stepner, stepner@mit.edu /* CC0 license information: To the extent possible under law, the author has dedicated all copyright and related and neighboring rights to this software to the public domain worldwide. This software is distributed without any warranty. This code is licensed under the CC0 1.0 Universal license. The full legal text as well as a human-readable summary can be accessed at http://creativecommons.org/publicdomain/zero/1.0/ */ * Why did I include a formal license? Jeff Atwood gives good reasons: http://www.codinghorror.com/blog/2007/04/pick-a-license-any-license.html program define vam version 10.1 set more off syntax varname(ts fv), teacher(varname) year(varname) class(varname) [ /// by(varlist) /// controls(varlist ts fv) absorb(varname) tfx_resid(varname) /// data(string) output(string) output_addvars(varlist) /// driftlimit(integer -1) /// QUASIexperiment] * Error checks local depvar `varlist' capture confirm variable score_r, exact if (_rc==0) { di as error "The dataset loaded in memory when vam is run cannot have a variable named score_r." exit 110 } capture confirm variable tv, exact if (_rc==0) { di as error "The dataset loaded in memory when vam is run cannot have a variable named tv." exit 110 } if ("`quasiexperiment'"!="") { capture confirm variable tv_2yr_l, exact if (_rc==0) { di as error "The dataset loaded in memory when vam is run cannot have a variable named tv_2yr_l." exit 110 } capture confirm variable tv_2yr_f, exact if (_rc==0) { di as error "The dataset loaded in memory when vam is run cannot have a variable named tv_2yr_f." exit 110 } capture confirm variable tv_ss, exact if (_rc==0) { di as error "The dataset loaded in memory when vam is run cannot have a variable named tv_ss." exit 110 } } local merge_tv=0 local merge_resid=0 if ("`data'"=="") local data="preserve" else { if !inlist("`data'","preserve","tv","merge tv","merge score_r","merge tv score_r","merge score_r tv","variance") { di as error "Not a valid argument for data. Choose either 'preserve', 'tv', 'merge [tv AND/OR score_r]', or 'variance'." exit 198 } else { tokenize "`data'" if ("`1'")=="merge" { if ("`2'"=="tv") | ("`3'"=="tv") local merge_tv=1 if ("`2'"=="score_r") | ("`3'"=="score_r") local merge_resid=1 } } } if "`tfx_resid'"!="" & "`absorb'"!="" { di as error "Cannot specify an absorb variable and a tfx_resid variable simultaneously." exit 198 } * If output was left blank, set a tempfile for the tv output if `"`output'"'=="" { tempfile output local nooutput=1 } else local nooutput=0 * Start log if (`nooutput'!=1) log using `"`output'_log"', replace * Process by variables if ("`by'"!="") { tempvar byvar egen `byvar'=group(`by'), label sum `byvar', meanonly local by_vals=`r(max)' } else local by_vals=1 **************** preserve *** Run through separately for each by-value. local firstloop=1 forvalues l=1/`by_vals' { if (`firstloop'!=1) restore, preserve *** Print heading (with by-variable identifier if applciable) di "{txt}{hline}" if ("`by'"!="") { local bylabel : label `byvar' `l', strict di "{bf:-> by variables:} `by' = `bylabel'" } *** Drop invalid observations *** qui drop if missing(`teacher',`year',`class') *** Keep only the correct by-value if ("`by'"!="") qui keep if `byvar'==`l' *** Run regression * If absorb or tfx_resid is not empty (only one is non-empty, otherwise an error was thrown), use areg if "`absorb'"!="" | "`tfx_resid'"!="" { qui areg `depvar' `controls' , absorb(`absorb'`tfx_resid') } * If absorb and tfx_resid are both empty, run regular regression else { qui reg `depvar' `controls' } *** Predict residuals * If tfx_resid is empty, predict residuals if "`tfx_resid'"=="" { qui predict score_r if e(sample),r } * If tfx_resid was specified, predict residuals + absorbed teacher fixed effects else { qui predict score_r if e(sample), dresiduals } *** Save residuals to a dataset if merging them later if `merge_resid'==1 { tempfile resid_data_`l' qui save `"`resid_data_`l''"', replace } *** Save number of parameters tempname num_obs num_par scalar `num_obs' = e(N) * If absorb is not empty (and tfx_resid is), save (number of slopes + number of clusters + 1) if "`absorb'"!="" { scalar `num_par' = e(df_m) + e(df_a) + 1 } * Otherwise, save (number of slopes + 1) else { scalar `num_par' = e(df_m) + 1 } *** Create var for number of students in class tempvar n_tested qui bys `teacher' `year' `group' `class': egen `n_tested' = count(score_r) *** Compute total variance *** tempvar class_mean index qui by `teacher' `year' `group' `class': egen `class_mean' = mean(score_r) qui by `teacher' `year' `group' `class': g `index' = _n tempname var_total qui sum score_r scalar `var_total' = r(Var)*((`num_obs' - 1)/(`num_obs' - `num_par')) *** Compute individual variance (i.e. within class variance) *--> note that we use rmse instead of direct variance of residuals here to deal with fact that class effects have not been shrunk tempname num_class var_ind var_class tempvar individual_dev_from_class qui gen `individual_dev_from_class' = score_r - `class_mean' qui count if `index'==1 & `n_tested'!=0 scalar `num_class' = r(N) qui sum `individual_dev_from_class' scalar `var_ind' = r(Var)*((`num_obs' - 1)/(`num_obs' - `num_class' - `num_par' + 1)) ********** Collapse to class-level data ********** qui by `teacher' `year' `group' `class': keep if _n==1 *** Estimate covariance of two classes taught by same teacher in the same year set seed 9827496 tempvar rand classnum g `rand'=uniform() bys `teacher' `year' (`rand'): gen `classnum'=_n * If there are multiple classes per teacher-year cell, compute the covariance. * Otherwise set to 0. Will display as missing in output, but internally set to 0 because it will never appear in the VCV, but the way things are coded requires that it be non-missing. tempname cov_sameyear corr_sameyear obs_sameyear qui sum `classnum' if (r(max)==1) { local missing_sameyear=1 scalar `cov_sameyear'=0 } else { local missing_sameyear=0 tempvar identifier egen `identifier'=group(`teacher' `year') qui tsset `identifier' `classnum'/*, noquery*/ qui corr `class_mean' f.`class_mean' [aw=`n_tested'+f.`n_tested'], cov scalar `cov_sameyear'=r(cov_12) scalar `corr_sameyear'=r(cov_12) / ( sqrt(r(Var_1)) * sqrt(r(Var_2)) ) scalar `obs_sameyear'=r(N) } *** Compute the variance of the class-level shock. Hits all kids in the class in the same way, but is unrelated across classes even taught by the same teacher in the same year. scalar `var_class' = `var_total' - `var_ind' - `cov_sameyear' if (`var_class'<0) { di as error "Note: var_class has been computed as being less than 0." di "var_class is defined as = var_total - var_ind - cov_sameyear." di "Computed variances: var_total, var_ind, cov_sameyear, var_class" di `var_total',`var_class',`var_ind',`cov_sameyear' di "This negative variance can occur because cov_sameyear is calculated using only the subsample of observations that teach multiple classes per year (in the same by-group)." } tempvar weight qui g `weight'=1/(`var_class' + `var_ind'/`n_tested') *** Keep teacher-years which have no weight tempvar excess_weight qui gen `excess_weight'=(missing(`weight')) qui replace `weight'=1 if missing(`weight') * note: adding this weight doesn't affect the class_mean, because missing observations are not included in the mean computation. it only affects the rawsum of weight, and so we remove it afterward. ********** Collapse to teacher-year level data using precision weights ********** collapse (mean) `class_mean' (rawsum) `weight' `n_tested' `excess_weight' [aw=`weight'], by(`teacher' `year' `by') fast * Remove the excess weight used to keep missing scores qui replace `weight'=`weight'-`excess_weight' *** Estimate the covariance of years t and t+i for every i, and store in vector m qui tsset `teacher' `year'/*, noquery*/ tempvar minyear maxyear diff validyear minvalidyear maxvalidyear diffvalid qui bys `teacher': egen `minyear'=min(`year') qui by `teacher': egen `maxyear'=max(`year') qui g `diff'=`maxyear'-`minyear' qui sum `diff' local maxspan=`r(max)' qui gen `validyear'=`year' if !missing(`class_mean') qui by `teacher': egen `minvalidyear'=min(`validyear') qui by `teacher': egen `maxvalidyear'=max(`validyear') qui g `diffvalid'=`maxvalidyear'-`minvalidyear' qui sum `diffvalid' local maxscorespan=`r(max)' if (`maxscorespan'<`maxspan') & (`driftlimit'<=0) { di as error _n "error: The maximum lags of teacher data is `maxspan', but the maximum lags of teacher data with class scores is `maxscorespan'." di as error " You must either set driftlimit() <= `maxscorespan', or drop observations so that the spans are no longer mismatched." exit 499 } if (`driftlimit'>`maxscorespan') { di as error "error: driftlimit(`driftlimit') was specified, which is greater than the number of lags (`maxscorespan') in the data." exit 499 } mata:CC=compute_cov_corr("`class_mean'","`n_tested'",`maxscorespan',"`teacher'") if (`driftlimit'>0) mata:m=create_m(CC[.,1],st_numscalar("`cov_sameyear'"),`maxspan',`driftlimit') else mata:m=create_m(CC[.,1],st_numscalar("`cov_sameyear'")) *** Print estimated variances and covariances di "Standard deviations: total, classes, students, teachers same year" if (`missing_sameyear'==0) di sqrt(`var_total'),sqrt(`var_class'),sqrt(`var_ind'),sqrt(`cov_sameyear') else di sqrt(`var_total'),sqrt(`var_class'),sqrt(`var_ind'),. di "Covariances (left), correlations (middle), observations (right). Row i indicates the relation between year t and t+i:" mata:CC[.,1..3] if (`driftlimit'>0) { di "Drift limit specified:" di `driftlimit' di "Covariances used for VA computations:" mata: m[2..length(m)]' } mata:check_m_nomissing(m) *** Accumulate the estimated variances/covariances/correlations across by-vals if (`firstloop'==1) { mata:cov_lag_accum= CC[.,1] mata:corr_lag_accum= CC[.,2] mata:obs_lag_accum= CC[.,3] mata:cov_se_lag_accum= CC[.,4] mata:var_total_accum= st_numscalar("`var_total'") mata:var_class_accum= st_numscalar("`var_class'") mata:var_ind_accum= st_numscalar("`var_ind'") if (`missing_sameyear'==1) { mata:cov_sameyear_accum=. mata:corr_sameyear_accum=. mata:obs_sameyear_accum=0 } else { mata:cov_sameyear_accum=st_numscalar("`cov_sameyear'") mata:corr_sameyear_accum=st_numscalar("`corr_sameyear'") mata:obs_sameyear_accum=st_numscalar("`obs_sameyear'") } } else { mata:cov_lag_accum= rightAppendMatrices(cov_lag_accum,CC[.,1]) mata:corr_lag_accum= rightAppendMatrices(corr_lag_accum,CC[.,2]) mata:obs_lag_accum= rightAppendMatrices(obs_lag_accum,CC[.,3]) mata:cov_se_lag_accum= rightAppendMatrices(cov_se_lag_accum,CC[.,4]) mata:var_total_accum= var_total_accum,st_numscalar("`var_total'") mata:var_class_accum= var_class_accum,st_numscalar("`var_class'") mata:var_ind_accum= var_ind_accum,st_numscalar("`var_ind'") if (`missing_sameyear'==1) { mata:cov_sameyear_accum= cov_sameyear_accum,. mata:corr_sameyear_accum= corr_sameyear_accum,. mata:obs_sameyear_accum= obs_sameyear_accum,. } else { mata:cov_sameyear_accum=cov_sameyear_accum,st_numscalar("`cov_sameyear'") mata:corr_sameyear_accum=corr_sameyear_accum,st_numscalar("`corr_sameyear'") mata:obs_sameyear_accum=obs_sameyear_accum,st_numscalar("`obs_sameyear'") } } ********* * Count the number of obs for each teacher sort `teacher' `year' tempvar obs_teacher by `teacher': egen `obs_teacher'=count(`teacher') * Compute teacher VA qui gen float tv=. if "`quasiexperiment'"!="" { qui gen float tv_2yr_l=. qui gen float tv_2yr_f=. qui gen float tv_ss=. mata: driftcalclist(vectorToStripeDiag(m), "`teacher'", "`year'", "`class_mean'", "`weight'", "`obs_teacher'", "tv", "tv_2yr_l", "tv_2yr_f", "tv_ss") } else mata:driftcalclist(vectorToStripeDiag(m), "`teacher'", "`year'", "`class_mean'", "`weight'", "`obs_teacher'", "tv") * Save the VA estimates to a dataset if ("`quasiexperiment'"=="") keep `teacher' `year' `by' tv else keep `teacher' `year' `by' tv tv_2yr_l tv_2yr_f tv_ss if (`firstloop'!=1) append using `"`output'"', nolabel qui save `"`output'"', replace * Turn firstloop counter off local firstloop=0 } di "{txt}{hline}" * Save VA estimates if "`output_addvars'"!="" quietly { restore, preserve keep `teacher' `year' `by' `output_addvars' bys `teacher' `year' `by' `output_addvars': keep if _n==1 merge m:1 `teacher' `year' `by' using `"`output'"', nogen nolabel } sort `teacher' `year' `by' qui save `"`output'"', replace * Save "variances / covariances / correlations" dataset to csv if ("`by'"!="") { local bylabels="" forvalues i=1/`by_vals' { local bylabel : label `byvar' `i', strict local bylabel=subinstr("`bylabel'"," ","_",.) local bylabels `bylabels' _`bylabel' } mata:saveVariancesToDataset(cov_lag_accum, corr_lag_accum, obs_lag_accum, cov_se_lag_accum, var_total_accum, var_class_accum, var_ind_accum, cov_sameyear_accum, corr_sameyear_accum, obs_sameyear_accum, tokens(st_local("bylabels"))) } else mata:saveVariancesToDataset(cov_lag_accum, corr_lag_accum, obs_lag_accum, cov_se_lag_accum, var_total_accum, var_class_accum, var_ind_accum, cov_sameyear_accum, corr_sameyear_accum, obs_sameyear_accum, "") if (`nooutput'!=1) qui outsheet using `"`output'_variance.csv"', comma replace * Load the correct output dataset tokenize "`data'" if inlist("`1'","preserve","merge") { restore if (`merge_resid'==1) { if ("`byvar'"!="") qui keep if missing(`teacher',`year',`class',`byvar') else qui keep if missing(`teacher',`year',`class') forvalues l=1/`by_vals' { append using `"`resid_data_`l''"', nolabel } } if (`merge_tv'==1) qui merge m:1 `teacher' `year' `by' `output_addvars' using `"`output'"', nogen nolabel /* else "`data'"=="preserve", and that is already loaded. */ } else { restore, not if ("`data'"=="tv") use `"`output'"', clear /* else "`data'"=="variance", and that is already loaded. */ } * Close log if (`nooutput'!=1) log close end version 11 set matastrict on mata: real rowvector computeweights(real matrix M, real scalar i, real colvector c, | real colvector weights) { // construct matrix A which is used to select the relevant elements of M in constructing the VCV matrix real matrix temp real matrix A temp=designmatrix(c) A = temp, J(rows(c),cols(M)-cols(temp),0) // use A to select elements of M and build the VCV. The second term adjusts the diagonal elements of the VCV matrix to account for the class-level and individual-level shocks real matrix vcv if (args()==4) vcv=A*M*A' + diag(1:/weights) else vcv=A*M*A' // phi is the vector of autocovariances, selected correctly using the matrix A. real rowvector phi phi=M[i,.]*A' // return the vector of weights return (phi*invsym(vcv)) } real matrix compute_cov_corr(string scalar scores_var, string scalar weight_var, real scalar dim, string scalar teacher_var) { // pre-allocate matrix real matrix CC CC = J(dim,4,.) // Fill cov's and corr's: between time t and t+i real scalar i real scalar tstat for (i=1; i<=dim; i++) { // check that there are >=2 obs, in order to compute covariance stata(invtokens(("quietly count if !missing(",scores_var,",f",strofreal(i),".",scores_var,")"),"")) if (st_numscalar("r(N)")>1) { stata(invtokens(("quietly corr ",scores_var," f",strofreal(i),".",scores_var," [aw=",weight_var,"+f",strofreal(i),".",weight_var,"], cov"),"")) CC[i,1]=st_numscalar("r(cov_12)") CC[i,2]=CC[i,1] / ( sqrt(st_numscalar("r(Var_1)")) * sqrt(st_numscalar("r(Var_2)")) ) } CC[i,3]=st_numscalar("r(N)") // Compute SE for covariance estimate if (st_numscalar("r(N)")>1) { stata(invtokens(("quietly reg ",scores_var," f",strofreal(i),".",scores_var," [aw=",weight_var,"+f",strofreal(i),".",weight_var,"], cluster(",teacher_var,")"),"")) tstat=st_matrix("e(b)")[1,1] / sqrt( st_matrix("e(V)")[1,1] ) CC[i,4]=abs(CC[i,1]/tstat) } } return (CC) } real rowvector create_m(real colvector lag_covariances, real scalar cov_sameyear, | real scalar lagdim, real scalar driftlimit) { real rowvector m if (args()==2) m=cov_sameyear,lag_covariances' else { if (length(lag_covariances)0) _error("covariance vector contains missing values") } real matrix vectorToStripeDiag(real vector m) { real scalar dim dim = length(m) // pre-allocate matrix M real matrix M M=J(dim,dim,.) // fill lower triangle of M real scalar i real scalar j for (i=1; i<=dim; i++) { for (j=i; j<=dim; j++) { M[j,i]=m[j-i+1] } } _makesymmetric(M) return (M) } real matrix rightAppendMatrices(real matrix A, real matrix B) { real scalar rA real scalar rB rA=rows(A) rB=rows(B) if (rA==rB) return (A,B) else if (rA 0) { st_store(obs,va_var_ind,driftcalc(M,time-year_index,Z_obs[.,2]:-year_index,Z_obs[.,3],Z_obs[.,4])) } if (quasi==1) { // remove the rows of Z_obs corresponding to the previous year Z_quasi=select(Z_obs, Z_obs[.,2]:!=time-1) if (rows(Z_quasi) > 0) { st_store(obs,va_2yr_l_var_ind,driftcalc(M,time-year_index,Z_quasi[.,2]:-year_index,Z_quasi[.,3],Z_quasi[.,4])) } // remove the rows of Z_obs corresponding to the next year Z_quasi=select(Z_obs, Z_obs[.,2]:!=time+1) if (rows(Z_quasi) > 0) { st_store(obs,va_2yr_f_var_ind,driftcalc(M,time-year_index,Z_quasi[.,2]:-year_index,Z_quasi[.,3],Z_quasi[.,4])) } // remove the rows of Z_obs corresponding to {t-2,t-1,t+1,t+2} Z_quasi=select(Z_obs, (Z_obs[.,2]:>time+2)+(Z_obs[.,2]: 0) { st_store(obs,va_ss_var_ind,driftcalc(M,time-year_index,Z_quasi[.,2]:-year_index,Z_quasi[.,3],Z_quasi[.,4])) } } } } } end