*! version 1.0.0 20Sep2018 Mauricio Caceres Bravo, mauricio.caceres.bravo@gmail.com
*! -duplicates- implementation using -gegen tag- for faster processing

* gduplicates subcommand [varlist] [if] [in] [, options]
*
* Subcommands (abbreviable to one letter except -drop-):
*     report    tabulate number of copies, observations, and surplus
*     examples  list one example observation per duplicated group
*     list      list every duplicated observation
*     tag       generate(newvar) = number of duplicates of each observation
*     drop      keep one observation per group (force required with a varlist)
*     browse    rejected with an explanatory message (rc 198)
*
* Stored results (report only):
*     r(unique_value)  the r(J) value returned by the -gegen tag()- call
*
* Options common to all subcommands:
*     gtools(str)  passed verbatim to the underlying gtools call
* Options for examples/list:
*     sorted       sort output by group (calls examplesList)
*     unsorted     accepted; unsorted output is already the default
*     *            anything else is forwarded to -list-

capture program drop gduplicates
program gduplicates, rclass
    version 13.1

    * Copy of the full command line; not referenced again in this program.
    local 00 `0'
    gettoken cmd 0 : 0, parse(" ,")
    local l = length("`cmd'")

    * Get subcommand
    * --------------

    if ( `l' == 0 ) {
        di "{err}subcommand needed; see help on {help gduplicates##|_new:gduplicates}"
        exit 198
    }

    * Each test accepts any leading abbreviation of the subcommand name;
    * -drop- must be typed in full.
    if ( substr("report", 1, max(1, `l')) == "`cmd'" ) {
        local cmd "report"
    }
    else if ( substr("examples", 1, max(1, `l')) == "`cmd'" ) {
        local cmd "examples"
    }
    else if ( substr("list", 1, max(1, `l')) == "`cmd'" ) {
        local cmd "list"
    }
    else if ( substr("browse", 1, max(1, `l')) == "`cmd'" ) {
        local cmd "browse"
        * Adjacent string literals are concatenated by -display- with no
        * separator, so each fragment carries its own trailing space.
        disp "{p 0 0 2}As of Stata 11.0, browse is no longer a valid "        ///
             "{cmd}duplicates subcommand; hence gtools will not support it. " ///
             "{result}See {help duplicates##remarks:Remarks} under help "     ///
             "{helpb duplicates} for an explanation.{p_end}"
        exit 198
    }
    else if ( substr("tag", 1, max(1, `l')) == "`cmd'" ) {
        local cmd "tag"
    }
    else if ( "drop" == "`cmd'" ) {
        * OK
    }
    else {
        di "{err}illegal {cmd}gduplicates {err}subcommand"
        exit 198
    }

    * Check syntax
    * ------------

    if ( "`cmd'" == "drop" ) {
        * A varlist without -force- is an error.
        capture syntax varlist [if] [in], [gtools(str)]
        if ( _rc == 0 ) {
            di "{err}force option required with {cmd}gduplicates drop {it}varlist{rm}"
            exit 198
        }

        * Either varlist + force, or no varlist at all (all variables).
        capture syntax varlist [if] [in], force [gtools(str)]
        if ( _rc ) {
            syntax [varlist] [if] [in], [gtools(str)]
            unab varlist : _all
            local vartext "{txt} all variables"
        }
        else {
            local vartext "{res} `varlist'"
        }
    }
    else if ( "`cmd'" == "tag" ) {
        syntax [varlist(default=none)] [if] [in], Generate(str) [gtools(str)]

        capture confirm new variable `generate'
        if ( _rc ) {
            di as err "generate() must specify new variable"
            exit _rc
        }

        if ( "`varlist'" == "" ) {
            unab varlist : _all
            local vartext "{txt} all variables"
        }
        else {
            local vartext "{res} `varlist'"
        }
    }
    else {
        syntax [varlist(default=none)] [if] [in] [ , SORTed UNSORTed gtools(str) * ]

        if ( "`varlist'" == "" ) {
            unab varlist : _all
            local vartext "{txt} all variables"
        }
        else {
            local vartext "{res} `varlist'"
        }
    }

    * Dedup algorithm
    * ---------------

    tempvar example Ngroup freq surplus dgroup order

    /*
    order       1 up     _n when called
    dgroup      0        if unique on varlist (not a "duplicated" group)
                1 up     labels groups which share identical values on varlist
    Ngroup      1        if unique on varlist
                2 up     is # in each dgroup
    example     1        to show if showing examples -- and to keep if -drop-
                0        to drop if -drop-
    freq        #        # in each group
    surplus     #        # of surplus observations
    */

    di _n "{p 0 4}{txt}Duplicates in terms of `vartext'{p_end}"

    * tag - count duplicates by group
    * -------------------------------

    if ( "`cmd'" == "tag" ) {
        * GTOOLS_DUPS is set around every gegen call and cleared right after;
        * presumably gegen reads it to identify the caller -- confirm.
        global GTOOLS_DUPS gduplicates
        cap noi gegen `generate' = count(1) `if' `in', by(`varlist') missing `gtools'
        local rc = _rc
        global GTOOLS_DUPS ""
        if ( `rc' ) error `rc'

        * Group size minus one = number of *other* copies of the observation.
        qui replace `generate' = `generate' - 1
        qui compress `generate'
        exit 0
    }

    * report - stats on duplicates
    * ----------------------------

    if ( "`cmd'" == "report" ) {
        if ( `"`if'"' != "" ) {
            marksample touse, novarlist
            local ifin if `touse' `in'
        }
        else {
            * No -if- qualifier here, so only -in- (possibly empty) remains.
            local ifin `in'
        }

        * Tag one observation per group and record each group's size.
        global GTOOLS_DUPS gduplicates
        cap noi gegen `example' = tag(`varlist') `ifin', counts(`Ngroup') missing `gtools'
        local rc = _rc
        global GTOOLS_DUPS ""
        if ( `rc' ) error `rc'

        return scalar unique_value = `r(J)'

        * Number of observations sharing each "copies" value.
        global GTOOLS_DUPS gduplicates
        cap noi gegen `freq' = count(1) `ifin', by(`Ngroup') missing `gtools'
        local rc = _rc
        global GTOOLS_DUPS ""
        if ( `rc' ) error `rc'

        * Surplus = observations beyond one per group.
        gen `surplus' = `freq' - ( `freq' / `Ngroup' )

        label var `Ngroup'  "copies"
        label var `freq'    "observations"
        label var `surplus' "surplus"

        tabdisp `Ngroup' if `example', cell(`freq' `surplus')
        exit 0
    }

    * drop
    * ----

    if ( "`cmd'" == "drop" ) {
        if ( `"`if'`in'"' != "" ) {
            marksample touse, novarlist
            local ifin if `touse' `in'
        }

        global GTOOLS_DUPS gduplicates
        cap noi gegen `example' = tag(`varlist') `ifin', missing `gtools'
        local rc = _rc
        global GTOOLS_DUPS ""
        if ( `rc' ) error `rc'

        * bail out now if no duplicates
        if ( `r(N)' == `r(J)' ) {
            di _n as txt "(0 observations are duplicates)"
            exit 0
        }

        di
        if ( `"`if'`in'"' == "" ) {
            noisily keep if `example'
        }
        else {
            * Observations outside the if/in sample are never dropped.
            noisily keep if `example' | !`touse'
        }
        exit 0
    }

    * examples or list
    * ----------------

    * Unsorted output is the default; only -sorted- is consulted here.
    local opts varlist(`varlist') ifin(`if' `in') cmd(`cmd')
    if ( "`sorted'" != "" ) {
        cap noi examplesList, `opts' gtools(`gtools') `options'
        exit _rc
    }
    else {
        cap noi examplesListUnsorted, `opts' gtools(`gtools') `options'
        exit _rc
    }
end

* Examples and list
* -----------------

* examplesListUnsorted: show duplicates for -examples- / -list- without
* re-sorting the data.
*
*     varlist(str)    variables that define a duplicate
*     cmd(str)        "examples" or "list"
*     ifin(str asis)  if/in qualifiers, passed through verbatim
*     gtools(str)     extra options for _gtools_internal
*     nowarning       suppress the "left unsorted" note
*     *               forwarded to -list-

capture program drop examplesListUnsorted
program examplesListUnsorted
    syntax, varlist(str) cmd(str) [ifin(str asis) gtools(str) noWARNing *]
    tempvar example Ngroup dgroup order

    global GTOOLS_CALLER ghash
    local opts  missing gfunction(hash) `gtools'
    local gopts gen(`dgroup') counts(`Ngroup') tag(`example')
    cap noi _gtools_internal `varlist' `ifin', `gopts' `opts'
    local rc = _rc
    global GTOOLS_CALLER ""

    if ( `rc' == 17999 ) {
        * Hand the request to official -duplicates-. The call is rebuilt from
        * the parsed pieces: this program's own `0' is an option string
        * (", varlist(...) ifin(...) cmd(...)"), not a -duplicates- command.
        duplicates `cmd' `varlist' `ifin', `options'
        exit 0
    }
    else if ( `rc' == 17001 ) {
        * Reported to the user as Stata's standard "no observations" error.
        error 2000
    }
    else if ( `rc' ) {
        exit `rc'
    }

    * Snapshot the results now so later commands cannot disturb them.
    local nobs    `r(N)'
    local ngroups `r(J)'

    * bail out now if no duplicates
    if ( `ngroups' == `nobs' ) {
        di _n as txt "(0 observations are duplicates)"
        exit 0
    }
    else {
        di _n as txt "`=`nobs' - `ngroups'' observations are duplicates. Examples:"
    }

    if ( `"`warning'"' != "nowarning" ) {
        disp "({cmd}note: {cmd}`cmd' {txt}left unsorted to improve performance; use option {cmd}sort {txt}to mimic {cmd}duplicates)"
    }

    * dgroup == 0 marks observations that are not part of a duplicated group.
    qui replace `dgroup' = 0 if ( `Ngroup' == 1 ) | mi(`dgroup')
    gen long `order' = _n

    * -subvarname- makes -list- print the [varname] characteristic as header.
    local lopts subvarname noobs `options'
    if ( "`cmd'" == "examples" ) {
        char `order'[varname]  "e.g. obs:"
        char `dgroup'[varname] "group:"
        char `Ngroup'[varname] "#"

        * The group column is only shown when there is more than one group.
        if ( `ngroups' > 1 ) {
            local lvars `dgroup' `Ngroup' `order' `varlist'
        }
        else {
            local lvars `Ngroup' `order' `varlist'
        }
        list `lvars' if `example' & `dgroup', `lopts'
    }
    else if ( "`cmd'" == "list" ) {
        char `order'[varname]  "obs:"
        char `dgroup'[varname] "group:"

        if ( `ngroups' > 1 ) {
            local lvars `dgroup' `order' `varlist'
        }
        else {
            local lvars `order' `varlist'
        }
        list `lvars' if `dgroup', `lopts'
    }
end

* examplesList: same as examplesListUnsorted, but sorts by duplicate group and
* original position before listing; -sortpreserve- restores the order on exit.
* Takes the same options; nowarning is accepted but has no effect here.

capture program drop examplesList
program examplesList, sortpreserve
    syntax, varlist(str) cmd(str) [ifin(str asis) gtools(str) noWARNing *]
    tempvar example Ngroup dgroup order

    global GTOOLS_CALLER ghash
    local opts  missing gfunction(hash) `gtools'
    local gopts gen(`dgroup') counts(`Ngroup') tag(`example')
    cap noi _gtools_internal `varlist' `ifin', `gopts' `opts'
    local rc = _rc
    global GTOOLS_CALLER ""

    if ( `rc' == 17999 ) {
        * Hand the request to official -duplicates-. The call is rebuilt from
        * the parsed pieces: this program's own `0' is an option string
        * (", varlist(...) ifin(...) cmd(...)"), not a -duplicates- command.
        duplicates `cmd' `varlist' `ifin', `options'
        exit 0
    }
    else if ( `rc' == 17001 ) {
        * Reported to the user as Stata's standard "no observations" error.
        error 2000
    }
    else if ( `rc' ) {
        exit `rc'
    }

    * Snapshot the results now so later commands cannot disturb them.
    local nobs    `r(N)'
    local ngroups `r(J)'

    * bail out now if no duplicates
    if ( `ngroups' == `nobs' ) {
        di _n as txt "(0 observations are duplicates)"
        exit 0
    }
    else {
        di _n as txt "`=`nobs' - `ngroups'' observations are duplicates. Examples:"
    }

    * dgroup == 0 marks observations that are not part of a duplicated group.
    qui replace `dgroup' = 0 if ( `Ngroup' == 1 ) | mi(`dgroup')
    gen long `order' = _n
    sort `dgroup' `order'

    * -subvarname- makes -list- print the [varname] characteristic as header.
    local lopts subvarname noobs `options'
    if ( "`cmd'" == "examples" ) {
        char `order'[varname]  "e.g. obs:"
        char `dgroup'[varname] "group:"
        char `Ngroup'[varname] "#"

        * The group column is only shown when there is more than one group.
        if ( `ngroups' > 1 ) {
            local lvars `dgroup' `Ngroup' `order' `varlist'
        }
        else {
            local lvars `Ngroup' `order' `varlist'
        }
        list `lvars' if `example' & `dgroup', `lopts'
    }
    else if ( "`cmd'" == "list" ) {
        char `order'[varname]  "obs:"
        char `dgroup'[varname] "group:"

        if ( `ngroups' > 1 ) {
            local lvars `dgroup' `order' `varlist'
        }
        else {
            local lvars `order' `varlist'
        }
        list `lvars' if `dgroup', `lopts'
    }
end