*! version 1.2 09dec2017 Michael D Barker Felix Pöge /******************************************************************************* Michael Barker mdb96@georgetown.edu Felix Pöge felix.poege@ip.mpg.de ustrdist.ado file Implements distance calculation between two strings using Stata 14's unicode functions. Uses Levenstein distance as metric. Thanks to Sergio Correia for his suggestions on how to improve strdist's performance. *******************************************************************************/ version 14 *** Check arguments and call appropriate sub-routine program define ustrdist, rclass syntax anything [if] [in] , [Generate(name) MAXdist(integer -1)] if `maxdist' < 1 & `maxdist' != -1 { noi di as text "Invalid argument maxdist (`maxdist'). The Levenstein distance without limitation will be computed." local maxdist 0 } gettoken first remain : anything , qed(isstring1) gettoken second remain : remain , qed(isstring2) if `"`remain'"' != "" error 103 * Two string scalar version if `isstring1' & `isstring2' ustrdist0var `if' `in' , first(`"`first'"') second(`"`second'"') gen(`generate') maxdist(`maxdist') * One or two string variable version else { local strscalar "" if `isstring1' { local strscalar = `"`first'"' local first "" } else if `isstring2' { local strscalar = `"`second'"' local second "" } ustrdist12var `first' `second' `if' `in' , m(`"`strscalar'"') gen(`generate') maxdist(`maxdist') } * Return values generated by subroutines return scalar d = r(d) return local strdist "`r(strdist)'" end *** One or two variable command program define ustrdist12var , rclass syntax varlist(min=1 max=2 string) [if] [in] , [Match(string)] [GENerate(name) MAXdist(integer 0)] tempvar touse mark `touse' `if' `in' if `maxdist' < 1 { local maxdist 0 } *** Declare default and confirm newvarname if `"`generate'"' == "" local generate "strdist" confirm new variable `generate' tokenize "`varlist'" if "`2'"=="" mata: umatalev1var("`1'" , `"`match'"' , "`generate'" , "`touse'", `maxdist') else mata: umatalev2var("`1'" , "`2'" , "`generate'" , "`touse'", `maxdist') return local strdist "`generate'" end *** Two string scalar command program define ustrdist0var, rclass syntax [if] [in] , [first(string) second(string) Generate(name) MAXdist(integer 0)] marksample touse if `maxdist' < 1 { local maxdist 0 } tempname dist mata: st_numscalar("`dist'" , ufastlev(`"`first'"' , `"`second'"', `maxdist')) if `"`generate'"' != "" { confirm new variable `generate' generate int `generate' = `dist' if `touse' return local strdist "`generate'" } display as result `dist' return scalar d = `dist' end mata: /****************************************************************************** Terminology Note key: string to measure each observation against trymatch: one of many potential matches to be measured against the key TRIES: Nx1 vector of all "trymatch"s ******************************************************************************/ void umatalev1var(string scalar varname, string scalar key, string scalar newvar , string scalar touse, | real scalar maxdist) { string colvector TRIES real colvector dist numeric t if (args()<5 | maxdist < 1) maxdist = 0 TRIES = st_sdata(. , varname , touse) // Nx1 string vector with potential matches dist = J(rows(TRIES),1,.) // Nx1 real vector to hold lev distances to each match for (t = 1; t <= rows(TRIES); t++) { dist[t] = ufastlev(key, TRIES[t,1], maxdist) // save distance } st_store(., st_addvar("int", newvar), touse, dist) } void umatalev2var(string scalar var1 , string scalar var2, string scalar newvar , string scalar touse, | real scalar maxdist) { string colvector TRIES string colvector KEYS real colvector dist numeric t if (args()<5 | maxdist < 1) maxdist = 0 KEYS = st_sdata(., var1, touse) TRIES = st_sdata(., var2, touse) // Nx1 string vector with potential matches dist = J(rows(TRIES), 1, .) // Nx1 real vector to hold lev distances to each match for (t = 1; t <= rows(TRIES); t++) { dist[t] = ufastlev(KEYS[t, 1], TRIES[t, 1], maxdist) } st_store(., st_addvar("int", newvar), touse, dist) } real scalar ufastlev(string scalar a, string scalar b, | real scalar maxdist) { real scalar len_a, len_b, i, j, cost real vector v0, v1, chars_a, chars_b if (args()<3 | maxdist < 1) maxdist = . // Ensure that maxdist is positive or missing if (a==b) return(0) // Shortcut if strings are the same len_a = ustrlen(a) len_b = ustrlen(b) if (maxdist < . & abs(len_a - len_b) > maxdist) return(.) // Shortcut if we know we will exceed maxdist if (len_a==0 | len_b==0) return(len_a + len_b) // Shortcut if one is empty // Code based on: // https://en.wikipedia.org/wiki/Levenshtein_distance#Iterative_with_two_matrix_rows v0 = 0::len_b v1 = J(len_b+1, 1, .) // The usual algorithm in strdist (next two lines) does not give correct // results for unicode strings // chars_a = ascii(a) // chars_b = ascii(b) // Adjustment for unicode characters // ascii() returns a list of bytes for unicode characters. // The code below maps this list of bytes to a single integer. Ideally, // there would be a function like ascii() which does this mapping, but I // do not know of any. The mapping does not give back the proper unicode // value, but at least it is bijective. chars_a = J(len_a, 1, .) for (i=1; i<=len_a; i++) { c = ascii(usubstr(a, i, 1)) if (length(c) == 1) { chars_a[i] = c[1] } else if (length(c) == 2) { chars_a[i] = c[1]*256 + c[2] } else if (length(c) == 3) { chars_a[i] = c[1]*65536 + c[2]*256 + c[3] } else if (length(c) == 4) { chars_a[i] = c[1]*16777216 + c[2]*65536 + c[3]*256 + c[4] } else { printf("Unicode should have 1-4 byte groups, but there are %f. Please contact the authors.\n", length(c)) } } chars_b = J(len_b, 1, .) for (i=1; i<=len_b; i++) { c = ascii(usubstr(b, i, 1)) if (length(c) == 1) { chars_b[i] = c } else if (length(c) == 2) { chars_b[i] = c[1]*256 + c[2] } else if (length(c) == 3) { chars_b[i] = c[1]*65536 + c[2]*256 + c[3] } else if (length(c) == 4) { chars_b[i] = c[1]*16777216 + c[2]*65536 + c[3]*256 + c[4] } else { printf("Unicode should have 1-4 byte groups, but there are %f. Please contact the authors.\n", length(c)) } } // Algorithm with stopping condition if (maxdist < .) { for (i=1; i<=len_a; i++) { v1[1] = i for (j=1; j<=len_b; j++) { cost = chars_a[i] != chars_b[j] v1[j + 1] = minmax(( v1[j] + 1, v0[j+1] + 1, v0[j] + cost ))[1] } swap(v0, v1) // If the minimum necessary edit distance is exceeded, stop if (minmax(v0)[1] > maxdist) return(.) } // Final check: Is the edit distance really not exceeded? if (v0[len_b+1] > maxdist) return(.) return(v0[len_b+1]) } // Algorithm without stopping condition else { for (i=1; i<=len_a; i++) { v1[1] = i for (j=1; j<=len_b; j++) { cost = chars_a[i] != chars_b[j] v1[j + 1] = minmax(( v1[j] + 1, v0[j+1] + 1, v0[j] + cost ))[1] } swap(v0, v1) } return(v0[len_b+1]) } } end