* v1 by Ryan Knight 30jun2011 program define cfby, rclass version 10.1 syntax [varlist] using/ , by(varname) ID(varname) [noPunct Upper Lower noMATCH noSTRING] cap isid `id' if _rc { duplicates tag `id' , gen(_iddup) di as err "Variable `id' does not uniquely identify the following observations in the master data" list `id' `altid' if _iddup exit 459 } if "`upper'`lower'`punct'" != "" & "`string'" != "" { di as err "`upper' `lower' `punct' may not be used with the nostrings option" exit 198 } preserve quietly { if "`string'" != "" { ds `varlist' , has(type string) local str `r(varlist)' local varlist: list varlist - str } keep `id' `varlist' `by' ds `id', not local varlistm `r(varlist)' foreach X in `varlistm' { rename `X' _cf`X' } tempfile master save `master' use "`using'", clear cap isid `id' noisily if _rc { duplicates tag `id' , gen(_iddup) di as err "Variable `id' does not uniquely identify the following observations in the using data" list `id' `altid' if _iddup exit 459 } * List variables occuring only in 1 dataset ds `id', not if "`string'" != "" { ds `r(varlist)' , has(type numeric) } local varlistuall `r(varlist)' local varlistu: list varlistuall & varlist local onlym: list varlistm - varlistu local onlym: list onlym - by noisily if "`onlym'" !="" { di _newline as txt "The following variables are not in the using dataset" foreach c in `onlym' { di as res "`c'" } } local onlyu: list varlistuall - varlistm local onlyu: list onlyu - by noisily if "`onlyu'" !="" { di _newline as txt "The following variables are not in the master dataset" foreach c in `onlyu' { di as res "`c'" } } local varlist: list varlistm & varlistu local varlist: list varlist - by keep `id' `varlist' `by' tempfile tmpuse save `tmpuse' use `master', clear merge `id' using `tmpuse', sort * List missing observations if "`match'"=="" { count if _merge==1 local musen `r(N)' if `musen' > 0 { tempvar muse gen `muse'=1 if _merge==1 sort `muse' noisily di _newline as err "The following observations are only in the master dataset:" _newline /// as txt "`id':" forvalues i=1/`musen' { noisily di as res `id'[`i'] } } count if _merge==2 local mmasn `r(N)' if `mmasn' >0 { tempvar mmas gen `mmas'=1 if _merge==2 sort `mmas' noisily di _newline as err "The following observations are only in the using dataset:" _newline /// as txt "`id':" forvalues i=1/`mmasn' { noisily di as res `id'[`i'] } } } keep if _merge ==3 // Only compare those with 2 entries to keep discrepancies reasonable drop _merge * Format strings if "`upper'`lower'`punct'" != "" & "`string'" == "" { qui ds , has(type string) local strings `r(varlist)' local stringsnoid: list strings - id foreach X of varlist `stringsnoid' { if "`upper'" != "" { replace `X' = upper(`X') } if "`lower'" != "" { replace `X' = lower(`X') } if "`punct'" != "" { replace `X' = subinstr(`X', "." , " " , .) replace `X' = subinstr(`X', "," , " " , .) replace `X' = subinstr(`X', "!" , "" , .) replace `X' = subinstr(`X', "?" , "" , .) replace `X' = subinstr(`X', "'" , "" , .) replace `X' = subinstr(`X', "--" , " " , .) replace `X' = subinstr(`X', "/" , " " , .) replace `X' = subinstr(`X', ";" , " " , .) replace `X' = subinstr(`X', ":" , " " , .) replace `X' = subinstr(`X', "(" , " " , .) replace `X' = subinstr(`X', ")" , " " , .) replace `X' = trim(`X') replace `X' = itrim(`X') } } } * Loop through questions, saving differences by ob tempvar totaldiffs qui gen `totaldiffs'=0 unab varlist: `varlist' foreach X in `varlist' { replace `totaldiffs'=`totaldiffs' + 1 if `X'!=_cf`X' if _rc { local diftype `diftype' `X' } } * Get total number of Qs by ob local remaining: list varlist - diftype local varcount: word count `remaining' tempvar totalqs gen `totalqs' = `varcount' * Get list of unique values for by var levelsof `by', local(usevals) levelsof _cf`by', local(masvals) local usecount: word count `usevals' local mascount: word count `masvals' * Create matrices for errors & questions by operator matrix def A = J(`usecount',`mascount',.) matrix def B = J(`usecount',`mascount',.) matrix def C = J(`usecount',`mascount',.) forvalues i=1/`usecount' { forvalues j=1/`mascount' { local useby: word `i' of `usevals' local masby: word `j' of `masvals' count if (`by'==`useby' & _cf`by'==`masby') | (_cf`by'==`useby' & `by'==`masby') if `r(N)' > 0 { sum `totaldiffs' if (`by'==`useby' & _cf`by'==`masby') | (_cf`by'==`useby' & `by'==`masby') local errors = r(sum) sum `totalqs' if (`by'==`useby' & _cf`by'==`masby') | (_cf`by'==`useby' & `by'==`masby') local questions = r(sum) matrix A[`i',`j'] = `errors'/`questions' matrix B[`i',`j'] = `errors' matrix C[`i',`j'] = `questions' } } } * Set row/column names local uselbls `usevals' local maslbls `masvals' foreach c in A B C { matrix rownames `c' = `uselbls' matrix roweq `c' = `by' matrix colnames `c' = `maslbls' matrix coleq `c' = `by' } } di _newline _dup(35) as txt "_" mat list A, title( "Discrepency rates by `by'") di _newline _dup(35) as txt "_" mat list B, title( "Discrepencies by `by'") di _newline _dup(35) as txt "_" mat list C, title( "Data points by `by'") if "`diftype'" !="" { di _newline as err "The following variables were not compared because they have a different string/numeric type in master/using:" di as res "`diftype'" } * Display overall discrepency rate qui sum `totaldiffs' local e = r(sum) qui sum `totalqs' local q = r(sum) di _newline _dup(35) as txt "_" _newline as txt "Total Discrepancies: " as res (`e') di as txt "Total Data Points Compared: " as res `q' di as txt "Percent Discrepancies: " %6.3f as res (`e')/`q'*100 as txt " percent" di _dup(35) as txt "_" if "`messyvars'"!="" | "`diftype'" !="" { di as err "Note: Not all variables in varlist compared." } if "`match'"=="" { if "`musen'"!="0" { di as err "Note: Not all observations compared; observations are missing in using data" } if "`mmasn'"!="0" { di as err "Note: Not all observations compared; observations are missing in master data" } } return matrix e = B return matrix q = C restore end