// lddtest: Logarithmic density discontinuity testing in Stata
// This code is built off McCrary's (2008) DCdensity command, originally sourced from https://eml.berkeley.edu/~jmccrary/DCdensity/ (accessed 30 June 2024)
                                                                     
capture program drop lddtest
program define lddtest, rclass
{
  
  version 9.0
  set more off
  pause on
  syntax varname(numeric) [if/] [in/], breakpoint(real) epsilon(real) [ b(real 0) h(real 0) alpha(real 0.05) at(string) graphname(string) noGRaph]
  
  *Confirm numbers
  confirm number `alpha'
  confirm number `breakpoint'
  confirm number `epsilon'
  
    *If epsilon <= 1...
	if (`epsilon' <= 1) {
		
		*... then stop the function
		display "'epsilon' must be strictly greater than 1"
		exit
		
	}
	
	*If alpha is not between 0 and 0.5...
	if (`alpha' <= 0 | `alpha' >= 0.5) {
		
		*... then stop the function
		display "'alpha' must be between 0 and 0.5"
		exit
		
	}
  
  marksample touse
  
  //Advanced user switch
  //0 - supress auxiliary output  1 - display aux output
  local verbose 0 
 
  //Bookkeeping before calling MATA function
  //"running variable" in terminology of McCrary (2008)
  local R "`varlist'"
  
  local cellmpname = "Xj"
  local cellvalname = "Yj"
  local evalname = "r0"
  local cellsmname = "fhat"
  local cellsmsename = "se_fhat"
  tempname Xj
  confirm new var `Xj'
  tempname Yj
  confirm new var `Yj'
  tempname r0
  capture confirm new var `r0'
  if (_rc!=0 & "`at'"!="r0") error 198
  tempname fhat
  confirm new var `fhat'
  tempname se_fhat
  confirm new var `se_fhat'
  
  //If the user does not specify the evaluation sequence, this it is taken to be the histogram midpoints
  if ("`at'" == "") {
    local at  = "Xj"
  }

  //Call MATA function
  mata: DCdensitysub("`R'", "`touse'", `breakpoint', `b', `h', `verbose', "Xj", "Yj", ///
                     "r0", "fhat", "se_fhat", "`at'", `epsilon', `alpha')

  //Dump MATA return codes into STATA return codes 
  return scalar theta = r(theta)
  return scalar se = r(se)
  return scalar binsize = r(binsize)
  return scalar bandwidth = r(bandwidth)
  return scalar epsilon = r(epsilon)
  return scalar alpha = r(alpha)
  return scalar TOST_lb = r(TOST_lb)
  return scalar TOST_ub = r(TOST_ub)
  return scalar TOST_z = r(TOST_z)
  return scalar TOST_p = r(TOST_p)
  return scalar ECI_lb = r(ECI_lb)
  return scalar ECI_ub = r(ECI_ub)
  

  //if user wants the graph...
  if ("`graph'"!="nograph") { 
    tempvar hi
    quietly gen `hi' = `fhat' + 1.96*`se_fhat'
    tempvar lo
    quietly gen `lo' = `fhat' - 1.96*`se_fhat'
    gr twoway (scatter `Yj' `Xj', msymbol(circle_hollow) mcolor(gray))           ///
      (line `fhat' `r0' if `r0' < `breakpoint', lcolor(black) lwidth(medthick))   ///
        (line `fhat' `r0' if `r0' > `breakpoint', lcolor(black) lwidth(medthick))   ///
          (line `hi' `r0' if `r0' < `breakpoint', lcolor(black) lwidth(vthin))              ///
            (line `lo' `r0' if `r0' < `breakpoint', lcolor(black) lwidth(vthin))              ///
              (line `hi' `r0' if `r0' > `breakpoint', lcolor(black) lwidth(vthin))              ///
                (line `lo' `r0' if `r0' > `breakpoint', lcolor(black) lwidth(vthin)),             ///
                  xline(`breakpoint', lcolor(black)) legend(off)
    if ("`graphname'"!="") {
      di "Exporting graph as `graphname'"
      graph export `graphname', replace
    }
  }
  //Drop newly-created variables
  drop `Xj' `Yj' `r0' `fhat' `se_fhat'
}
end


mata:
mata set matastrict on

void DCdensitysub(string scalar runvar, string scalar tousevar, real scalar c, real scalar b, ///
                  real scalar h, real scalar verbose, string scalar cellmpname, string scalar cellvalname, ///
                  string scalar evalname, string scalar cellsmname, string scalar cellsmsename, ///
                  string scalar atname, real scalar epsilon, real scalar alpha) {
  //   inputs: runvar - name of stata running variable ("R" in McCrary (2008))
  //             tousevar - name of variable indicating which obs to use
  //             c - point of potential discontinuity
  //             b - bin size entered by user (zero if default is to be used)
  //             h - bandwidth entered by user (zero if default is to be used)
  //             verbose - flag for extra messages printing to screen
  //             cellmpname - name of new variable that will hold the histogram cell midpoints
  //             cellvalname - name of new variable that will hold the histogram values
  //             evalname - name of new variable that will hold locations where the histogram smoothing was
  //                        evaluated
  //             cellsmname - name of new variable that will hold the smoothed histogram cell values
  //             cellsmsename - name of new variable that will hold standard errors for smoothed histogram cells
  //             atname - name of existing stata variable holding points at which to eval smoothed histogram

  //declarations for general use and histogram generation
  real colvector run						// stata running variable
  string scalar statacom					// string to hold stata commands
  real scalar errcode                                           // scalar to hold return code for stata commands
  real scalar rn, rsd, rmin, rmax, rp75, rp25, riqr     	// scalars for summary stats of running var
  real scalar l, r						// midpoint of lowest bin and highest bin in histogram
  real scalar lc, rc						// midpoint of bin just left of and just right of breakpoint
  real scalar j							// number of bins spanned by running var
  real colvector binnum						// each obs bin number
  real colvector cellval					// histogram cell values
  real scalar i							// counter
  real scalar cellnum						// cell value holder for histogram generation
  real colvector cellmp						// histogram cell midpoints

  //Set up histogram grid

  st_view(run, ., runvar, tousevar)     //view of running variable--only observations for which `touse'=1

  //Get summary stats on running variable
  statacom = "quietly summarize " + runvar + " if " + tousevar + ", det"
  errcode=_stata(statacom,1)
  if (errcode!=0) {
    "Unable to successfully execute the command "+statacom
    "Check whether you have given Stata enough memory"
  }
  rn = st_numscalar("r(N)")
  rsd = st_numscalar("r(sd)")
  rmin = st_numscalar("r(min)")
  rmax = st_numscalar("r(max)")
  rp75 = st_numscalar("r(p75)") 
  rp25 = st_numscalar("r(p25)")
  riqr = rp75 - rp25

  if ( (c<=rmin) | (c>=rmax) ) {
    printf("Breakpoint must lie strictly within range of running variable\n")
    _error(3498)
  }
  
  //set bin size to default in paper sec. IIIB unless provided by the user
  if (b == 0) {
    b = 2*rsd*rn^(-1/2)
    if (verbose) printf("Using default bin size calculation, bin size = %f\n", b)
  }

  //bookkeeping
  l = floor((rmin-c)/b)*b+b/2+c  // midpoint of lowest bin in histogram
  r = floor((rmax-c)/b)*b+b/2+c  // midpoint of lowest bin in histogram
  lc = c-(b/2) // midpoint of bin just left of breakpoint
  rc = c+(b/2) // midpoint of bin just right of breakpoint
  j = floor((rmax-rmin)/b)+2

  //create bin numbers corresponding to run... See McCrary (2008, eq 2)
  binnum = round((((floor((run :- c):/b):*b:+b:/2:+c) :- l):/b) :+ 1)  // bin number for each obs

  //generate histogram 
  cellval = J(j,1,0)  // initialize cellval as j-vector of zeros
  for (i = 1; i <= rn; i++) {
    cellnum = binnum[i]
    cellval[cellnum] = cellval[cellnum] + 1
  }
  
  cellval = cellval :/ rn  // convert counts into fractions
  cellval = cellval :/ b  // normalize histogram to integrate to 1
  cellmp = range(1,j,1)  // initialize cellmp as vector of integers from 1 to j
  cellmp = floor(((l :+ (cellmp:-1):*b):-c):/b):*b:+b:/2:+c  // convert bin numbers into cell midpoints
  
  //place histogram info into stata data set
  real colvector stcellval					// stata view for cell value variable
  real colvector stcellmp					// stata view for cell midpoint variable

  (void) st_addvar("float", cellvalname)
  st_view(stcellval, ., cellvalname)
  (void) st_addvar("float", cellmpname)
  st_view(stcellmp, ., cellmpname)
  stcellval[|1\j|] = cellval
  stcellmp[|1\j|] = cellmp
  
  //Run 4th order global polynomial on histogram to get optimal bandwidth (if necessary)
  real matrix P							// projection matrix returned from orthpoly command
  real matrix betaorth4						// coeffs from regression of orthogonal powers of cellmp
  real matrix beta4						// coeffs from normal regression of powers of cellmp
  real scalar mse4						// mean squared error from polynomial regression
  real scalar hleft, hright					// bandwidth est from polynomial left of and right of breakpoint
  real scalar leftofc, rightofc	      			        // bin number just left of and just right of breakpoint
  real colvector cellmpleft, cellmpright			// cell midpoints left of and right of breakpoint
  real colvector fppleft, fppright				// fit second deriv of hist left of and right of breakpoint

  //only calculate optimal bandwidth if user hasn't provided one
  if (h == 0) {
    //separate cells left of and right of the cutoff
    leftofc =  round((((floor((lc - c)/b)*b+b/2+c) - l)/b) + 1) // bin number just left of breakpoint
    rightofc = round((((floor((rc - c)/b)*b+b/2+c) - l)/b) + 1) // bin number just right of breakpoint
    if (rightofc-leftofc != 1) {
      printf("Error occurred in optimal bandwidth calculation\n")
      _error(3498)
    }
    cellmpleft = cellmp[|1\leftofc|]
    cellmpright = cellmp[|rightofc\j|]

    //estimate 4th order polynomial left of the cutoff
    statacom = "orthpoly " + cellmpname + ", generate(" + cellmpname + "*) deg(4) poly(P)"
    errcode=_stata(statacom,1)
    if (errcode!=0) {
      "Unable to successfully execute the command "+statacom
      "Check whether you have given Stata enough memory"
    }
    P = st_matrix("P")
    statacom = "reg " + cellvalname + " " + cellmpname + "1-" + cellmpname + "4 if " + cellmpname + " < " + strofreal(c)
    errcode=_stata(statacom,1)
    if (errcode!=0) {
      "Unable to successfully execute the command "+statacom
      "Check whether you have given Stata enough memory"
    }
    mse4 = st_numscalar("e(rmse)")^2
    betaorth4 = st_matrix("e(b)")
    beta4 = betaorth4 * P
    fppleft = 2*beta4[2] :+ 6*beta4[3]:*cellmpleft + 12*beta4[4]:*cellmpleft:^2
    hleft = 3.348 * ( mse4*(c-l) / sum( fppleft:^2) )^(1/5)

    //estimate 4th order polynomial right of the cutoff
    P = st_matrix("P")
    statacom = "reg " + cellvalname + " " + cellmpname + "1-" + cellmpname + "4 if " + cellmpname + " > " + strofreal(c)
    errcode=_stata(statacom,1)
    if (errcode!=0) {
      "Unable to successfully execute the command "+statacom
      "Check whether you have given Stata enough memory"
    }
    mse4 = st_numscalar("e(rmse)")^2
    betaorth4 = st_matrix("e(b)")
    beta4 = betaorth4 * P
    fppright = 2*beta4[2] :+ 6*beta4[3]:*cellmpright + 12*beta4[4]:*cellmpright:^2
    hright = 3.348 * ( mse4*(r-c) / sum( fppright:^2) )^(1/5)
    statacom = "drop " + cellmpname + "1-" + cellmpname + "4"
    errcode=_stata(statacom,1)
    if (errcode!=0) {
      "Unable to successfully execute the command "+statacom
      "Check whether you have given Stata enough memory"
    }

    //set bandwidth to average of calculations from left and right
    h = 0.5*(hleft + hright)
    if (verbose) printf("Using default bandwidth calculation, bandwidth = %f\n", h)
  }

  //Add padding zeros to histogram (to assist smoothing)
  real scalar padzeros						// number of zeros to pad on each side of hist
  real scalar jp						// number of histogram bins including padded zeros

  padzeros = ceil(h/b)  // number of zeros to pad on each side of hist
  jp = j + 2*padzeros
  if (padzeros >= 1) {
    //add padding to histogram variables
    cellval = ( J(padzeros,1,0) \ cellval \ J(padzeros,1,0) )
    cellmp = ( range(l-padzeros*b,l-b,b) \ cellmp \ range(r+b,r+padzeros*b,b) )
    //dump padded histogram variables out to stata
    stcellval[|1\jp|] = cellval
    stcellmp[|1\jp|] = cellmp
  }

  //Generate point estimate of discontinuity
  real colvector dist						// distance from a given observation
  real colvector w						// triangle kernel weights
  real matrix XX, Xy						// regression matrcies for weighted regression
  real rowvector xmean, ymean					// means for demeaning regression vars
  real colvector beta						// regression estimates from weighted reg.
  real colvector ehat						// predicted errors from weighted reg.
  real scalar fhatr, fhatl					// local linear reg. estimates at discontinuity
                                                                //   estimated from right and left, respectively
  real scalar thetahat						// discontinuity estimate
  real scalar sethetahat					// standard error of discontinuity estimate
  
  //Estimate left of discontinuity
  dist = cellmp :- c  // distance from potential discontinuity
  w = rowmax( (J(jp,1,0), (1:-abs(dist:/h))) ):*(cellmp:<c)  // triangle kernel weights for left
  w = (w:/sum(w)) :* jp  // normalize weights to sum to number of cells (as does stata aweights)
  xmean = mean(dist, w)
  ymean = mean(cellval, w)
  XX = quadcrossdev(dist,xmean,w,dist,xmean)    //fixed error on 11.17.2009
  Xy = quadcrossdev(dist,xmean,w,cellval,ymean)
  beta = invsym(XX)*Xy
  beta = beta \ ymean-xmean*beta
  fhatl = beta[2,1]
  
  //Estimate right of discontinuity
  w = rowmax( (J(jp,1,0), (1:-abs(dist:/h))) ):*(cellmp:>=c)  // triangle kernel weights for right
  w = (w:/sum(w)) :* jp  // normalize weights to sum to number of cells (as does stata aweights)
  xmean = mean(dist, w)
  ymean = mean(cellval, w)
  XX = quadcrossdev(dist,xmean,w,dist,xmean)   //fixed error on 11.17.2009
  Xy = quadcrossdev(dist,xmean,w,cellval,ymean)
  beta = invsym(XX)*Xy
  beta = beta \ ymean-xmean*beta
  fhatr = beta[2,1]
  
  //Generate testing bounds
  TOST_lb = -ln(epsilon)
  TOST_ub = ln(epsilon)
  
  //Calculate and display testing results
  thetahat = ln(fhatr) - ln(fhatl)
  sethetahat = sqrt( (1/(rn*h)) * (24/5) * ((1/fhatr) + (1/fhatl)) )
  
  //If LDD > 0...
  if (thetahat > 0) {
  	
	//... then Ha: LDD <= TOST_ub
	TOST_z = (thetahat - TOST_ub)/sethetahat
	TOST_p = normal(TOST_z)
	
  }
  //If LDD <= 0...
  if (thetahat <= 0) {
  	
	//... then Ha: LDD >= TOST_lb
	TOST_z = (thetahat - TOST_lb)/sethetahat
	TOST_p = 1 - normal(TOST_z)
	
  }
  
  //Generate ECI bounds
  ECI_lb = thetahat - invnormal(1 - alpha)*sethetahat
  ECI_ub = thetahat + invnormal(1 - alpha)*sethetahat
  ECI_pct = (1 - alpha)*100
  test_pct = alpha*100
  
  printf("\n        Discontinuity estimate (log difference in height): %f\n", thetahat)
  printf("                                                       SE: (%f)\n\n", sethetahat)
  printf("100 x (1 - alpha) percent equivalence confidence interval: [%f, %f]\n", ECI_lb, ECI_ub)
  printf("                                                    alpha: %f\n\n", alpha)
  printf("                                           Testing bounds: [%f, %f]\n", TOST_lb, TOST_ub)
  printf("                          Equivalence testing z-statistic: %f\n", TOST_z)
  printf("                              Equivalence testing p-value: %f\n\n", TOST_p)
  if (TOST_p <= alpha) {
  	
	printf("The discontinuity in the running variable's density estimates at the cutoff can be significantly bounded beneath a ratio of %f at a %f percent significance level.", epsilon, test_pct)
	
  }
  if (TOST_p > alpha) {
  	
	printf("The discontinuity in the running variable's density estimates at the cutoff can NOT be significantly bounded beneath a ratio of %f at a %f percent significance level.", epsilon, test_pct)
	
  }

  loopover=1 //This is an advanced user switch to get rid of LLR smoothing
  //Can be used to speed up simulation runs--the switch avoids smoothing at
  //eval points you aren't studying
  
  //Perform local linear regression (LLR) smoothing
  if (loopover==1) {
    real scalar cellsm						// smoothed histogram cell values
    real colvector stcellsm					// stata view for smoothed values
    real colvector atstata					// stata view for at variable (evaluation points)
    real colvector at						// points at which to evaluate LLR smoothing
    real scalar evalpts						// number of evaluation points
    real colvector steval						// stata view for LLR smothing eval points

    // if evaluating at cell midpoints
    if (atname == cellmpname) {  
      at = cellmp[|padzeros+1\padzeros+j|]
      evalpts = j
    }
    else {
      st_view(atstata, ., atname)
      evalpts = nonmissing(atstata)
      at = atstata[|1\evalpts|]
    }
    
    if (verbose) printf("Performing LLR smoothing.\n")
    if (verbose) printf("%f iterations will be performed \n",j)
    
    cellsm = J(evalpts,1,0)  // initialize smoothed histogram cell values to zero
    // loop over all evaluation points
    for (i = 1; i <= evalpts; i++) {
      dist = cellmp :- at[i]
      //set weights relative to current bin - note comma below is row join operator, not two separate args
      w = rowmax( (J(jp,1,0), ///
        (1:-abs(dist:/h))):*((cellmp:>=c)*(at[i]>=c):+(cellmp:<c):*(at[i]<c)) )
      //manually obtain weighted regression coefficients
      w = (w:/sum(w)) :* jp  // normalize weights to sum to N (as does stata aweights)
      xmean = mean(dist, w)
      ymean = mean(cellval, w)
      XX = quadcrossdev(dist,xmean,w,dist,xmean)  //fixed error on 11.17.2009 
      Xy = quadcrossdev(dist,xmean,w,cellval,ymean)
      beta = invsym(XX)*Xy
      beta = beta \ ymean-xmean*beta
      cellsm[i] = beta[2,1]
      //Show dots
      if (verbose) {
        if (mod(i,10) == 0) {
          printf(".")
          displayflush()
          if (mod(i,500) == 0) {
            printf(" %f LLR iterations\n",i)
            displayflush()
          }
        }
      }
    }
    printf("\n")
  
    //set up stata variable to hold evaluation points for smoothed values
    (void) st_addvar("float", evalname)
    st_view(steval, ., evalname)
    steval[|1\evalpts|] = at

    //set up stata variable to hold smoothed values
    (void) st_addvar("float", cellsmname)
    st_view(stcellsm, ., cellsmname)
    stcellsm[|1\evalpts|] = cellsm
    
    //Calculate standard errors for LLR smoothed values
    real scalar m					// amount of kernel being truncated by breakpoint
    real colvector cellsmse				// standard errors of smoothed histogram
    real colvector stcellsmse				// stata view for cell midpoint variable
    cellsmse = J(evalpts,1,0)  // initialize standard errors to zero
    for (i = 1; i <= evalpts; i++) {
      if (at[i] > c) {
        m = max((-1, (c-at[i])/h))
        cellsmse[i] = ((12*cellsm[i])/(5*rn*h))* ///
          (2-3*m^11-24*m^10-83*m^9-72*m^8+42*m^7+18*m^6-18*m^5+18*m^4-3*m^3+18*m^2-15*m)/ ///
            (1+m^6+6*m^5-3*m^4-4*m^3+9*m^2-6*m)^2
        cellsmse[i] = sqrt(cellsmse[i])
      }
      if (at[i] < c) {
        m = min(((c-at[i])/h, 1))
        cellsmse[i] = ((12*cellsm[i])/(5*rn*h))* ///
          (2+3*m^11-24*m^10+83*m^9-72*m^8-42*m^7+18*m^6+18*m^5+18*m^4-3*m^3+18*m^2+15*m)/ ///
            (1+m^6-6*m^5-3*m^4+4*m^3+9*m^2+6*m)^2
        cellsmse[i] = sqrt(cellsmse[i])
      }
    }
    //set up stata variable to hold standard errors for smoothed values
    (void) st_addvar("float", cellsmsename)
    st_view(stcellsmse, ., cellsmsename)
    stcellsmse[|1\evalpts|] = cellsmse
  }
  //End of loop over evaluation points
  
  //Fill in STATA return codes
  st_rclear()
  st_numscalar("r(theta)", thetahat)
  st_numscalar("r(se)", sethetahat)
  st_numscalar("r(binsize)", b)
  st_numscalar("r(bandwidth)", h)
  st_numscalar("r(epsilon)", epsilon)
  st_numscalar("r(alpha)", alpha)
  st_numscalar("r(TOST_lb)", TOST_lb)
  st_numscalar("r(TOST_ub)", TOST_ub)
  st_numscalar("r(TOST_z)", TOST_z)
  st_numscalar("r(TOST_p)", TOST_p)
  st_numscalar("r(ECI_lb)", ECI_lb)
  st_numscalar("r(ECI_ub)", ECI_ub)
}

end