// ==========================================================================
// ftools - Mata code
// Efficiently compute levels of variables (factors/categories)
// Project URL: https://github.com/sergiocorreia/ftools
// ==========================================================================


// --- Compiler settings ----------------------------------------------------

// Single switch for the whole build: 1 = debug, 0 = regular build.
local debug 0

// "on"/"off" strings derived from the switch; debug_off is the inverse.
local debug_on = cond(`debug', "on", "off")
local debug_off = cond(`debug', "off", "on")

set matadebug `debug_on'

// matastrict takes the inverse value, so it is on when debug is 0.
mata: mata set matastrict `debug_off'
mata: mata set mataoptimize on
mata: mata set matadebug `debug_on'
mata: mata set matalnum `debug_on'


// --- Version functions ----------------------------------------------------

// ms_get_version ships with this package; the assert below requires that
// the local macro package_version is non-empty after it runs.
ms_get_version ftools
assert "`package_version'" != ""

// The macros are expanded when each function is defined, so the returned
// strings are fixed at compile time.
mata: string scalar ftools_version() return("`package_version'")
mata: string scalar ftools_stata_version() return("`c(stata_version)'")
mata: string scalar ftools_joint_version() return("`package_version'|`c(stata_version)'")


// --- Source files ---------------------------------------------------------

// Each file is located on the adopath with findfile, which stores its full
// path in r(fn), and is then pulled in with include.

findfile "ftools_type_aliases.mata"
include "`r(fn)'"

findfile "ftools_common.mata"
include "`r(fn)'"

findfile "ftools_main.mata"
include "`r(fn)'"

// The hashing functions differ depending on whether cols(data)==1 or >1,
// so the same file is included twice, once per value of is_vector.
findfile "ftools_hash1.mata"
local is_vector 1
include "`r(fn)'"
local is_vector 0
include "`r(fn)'"

// Experimental dependency on gtools (used with method(gtools)).
findfile "ftools_plugin.mata"
include "`r(fn)'"

// Disabled:
//findfile "ftools_experimental.mata"
//include "`r(fn)'"

findfile "fcollapse_functions.mata"
include "`r(fn)'"


// --- Possible improvements ------------------------------------------------
// 1) Do this in a C plugin; perhaps using khash (MIT-licensed), like Pandas
// 2) Use a faster hash function such as SpookyHash or CityHash (both
//    MIT-licensed)
// 3) Use double hashing instead of linear/quadratic probing
// 4) Compute the hashes in parallel