/* Stata interface to n-gram counting C++ code */

program _ngram, plugin

/*
 * ngram_core: turn a string variable into one count variable per n-gram.
 *
 * Syntax:
 *   ngram_core varname [if] [in], [degree(#) threshold(#) prefix(str) binarize
 *                                  locale(str) stopwords(str) stemmer nolower
 *                                  punctuation non_token]
 *
 * degree, threshold, locale, stemmer, nolower, punctuation and the resolved
 * stopword list are forwarded verbatim to the "_ngram, parse" plugin call;
 * their exact meaning is defined by the plugin, not by this file.
 *
 *   prefix(str)    string prepended to every word before it is turned into a
 *                  variable name with strtoname(); defaults to "t_".
 *   stopwords(str) ""  -> load stopwords_<language>.txt if it can be found
 *                  "." -> use no stopwords
 *                  "+ a b c" -> default list plus the words given
 *                  anything else -> taken as the explicit list
 *   binarize       replace every count by the indicator (count > 0).
 *   non_token      do not create the n_token variable.
 *
 * Returns:
 *   r(words)  the names of the variables that were created and kept.
 */
program define ngram_core, rclass
*! 1.0.0 April 2017
	version 13
	syntax varname(string) [if] [in], [DEGree(int 1) THRESHold(int 5) Prefix(str) BINarize LOcale(str) STOPwords(str) STEMmer noLower PUNCTuation non_token]

	// fill in defaults for the str arguments (which 'syntax' doesn't let us do)
	if("`locale'"=="") {
		// locale is read from Stata14+'s `. set locale_functions` config value.
		// On an older Stata, locale will just end up empty, and this is tolerable:
		// if the wrong results come out the user can specify locale() explicitly.
		local locale = "`c(locale_functions)'"
	}

	parse_locale "`locale'"
	local language = "`r(language)'"
	local region = "`r(region)'"

	// parse the stopwords argument
	if(`"`stopwords'"'=="") {
		// if not given, default to the stopwords file for the current language.
		// *if this file is missing*, give a warning but continue with no stopwords
		// (but if the file is available but unreadable or something else goes wrong, error out)
		capture noisily load_stopwords "`language'"
		if(_rc == 0) {
			local stopwords = `"`r(stopwords)'"'
		}
		else if(_rc != 7) {
			// rc 7 means "missing file"; load_stopwords has already warned about it.
			// That must stay non-fatal because most languages have no stopwords file.
			// Anything else is a real error.
			exit _rc
		}
	}
	else if(`"`stopwords'"' == ".") {
		// special case: a single "." means "don't use any stopwords".
		// "" cannot be used for this because it is the only default value Stata
		// allows for str options, so it is already taken.
		local stopwords = ""
	}
	else {
		gettoken plus : stopwords
		if("`plus'"=="+") {
			// a + token means "append the given stopwords to the default stopwords"

			// shift the + off; this looks redundant, but the previous gettoken
			// only peeked at the first token without consuming it
			gettoken plus stopwords : stopwords

			// Only trust r(stopwords) if the load actually succeeded; otherwise
			// fall back to an empty default list (best effort, as before).
			local default_stopwords = ""
			capture noisily load_stopwords "`language'"
			if(_rc == 0) {
				local default_stopwords = `"`r(stopwords)'"'
			}

			// Careful: this must happen before the prepend below.
			// Careful: the gettoken always leaves an extra space up front,
			//          which is why there is no space after the colon here.
			di as txt "Also removing stopwords:`stopwords'"

			local stopwords = `"`default_stopwords' `stopwords'"'
		}
		else {
			// stopwords is (already) an explicit list
			di as txt "Removing stopwords: `stopwords'"
		}
	}
	//di as txt "stopwords = `stopwords'" //DEBUG

	/* normalize the flags; the plugin detects them by checking whether the string
	   it gets for each is empty or not, but Stata's way of doing default-trues
	   (where it gives you "nolower") breaks that */
	if("`lower'"=="nolower") {
		local lower = ""
	}
	else {
		local lower = "lower"
	}

	if("`prefix'"=="") {
		// default prefix is 't' for "text".
		// The point of the prefix is to avoid most naming conflicts with other Stata
		// variables and with Stata keywords, and as a side effect to make it easy to
		// select all the text-mined columns (or even to do a second batch of parsing).
		// The choice is left to the end user because there is no totally reliable way
		// to make safely quoted arbitrary variable names in Stata, especially when
		// text-mining: any quoting choice could collide with another distinct string
		// (e.g. "ba.ll" and "ba;ll" both quote to "ba_ll").
		local prefix = "t_"
	}

	// do the main batch of parsing, caching the results *in C*
	// The length helper is a long, not an int: an int holds at most 32,740, so
	// longer texts would have had their length stored as missing.
	tempvar l
	gen long `l' = strlen(`varlist')
	plugin call _ngram `varlist' `l' `if' `in', parse "`degree'" "`locale'" "`stemmer'" `"`stopwords'"' "`lower'" "`punctuation'" "`threshold'"
	drop `l'

	local valid_vars = "" //list of only the variables that will survive the end of this routine
	local vars = ""       //list of all variables, including the invalid ones, because precisely these must be passed back into the plugin a second time

	// NOTE(review): `words' is not assigned anywhere in this file; presumably the
	// "parse" plugin call above stores it in this program's local macros -- confirm.
	foreach word of local words {
		//di as txt "word = '`word''"

		// construct the variable name, sanitized into a valid Stata name
		local var = strtoname(`"`prefix'`word'"')

		/* summarily get rid of `var'. If it exists we would want to update it to have the new counts instead, anyway. */
		cap drop `var'

		/* make a new variable for counting `word' */
		capture noisily qui generate int `var' = 0
		if(_rc == 0) {
			label variable `var' "# of '`word'' in '`varlist''"
			local valid_vars = "`valid_vars' `var'"
		}
		else if(_rc == 900) {
			// out of space (> c(maxvar)). give up.
			continue, break
		}
		else {
			// strtoname() *does not* detect the cases where the name is a Stata keyword,
			// and there is no way to quote a variable name to shove keywords in.
			// The only ways to detect this case seem to be
			//  - a giant list of keywords (tedious and fragile)
			//  - exceptions-as-returns (unclean)
			// The latter is what is done here.
			di as error "Stata rejected '`var'' as a variable name. It will be dropped."

			// A column must still be put in place for the C code ("_ngram, export") to work
			// (if not, the counts would all be shifted!), so use a Stata-generated tempvar.
			tempvar var
			capture noisily qui generate int `var' = 0
			// Tempvars are expected to be erased from the dataset when this program ends,
			// which is what is wanted: the columns are all in the right places for
			// "_ngram, export" but the placeholder variable does not survive.
			//
			// Most of the problem words will be on the stopwords list anyway, but if the
			// user overrides that or is, say, using the Dutch stemmer on a mixed
			// English-Dutch corpus, keywords could slip in and must be handled.
		}

		/* collect the new variable */
		// note: this is only done *after* the if-else because the if-else has a break in it that means `var' is invalid
		local vars = "`vars' `var'"
	}
	local vars = trim("`vars'")
	local valid_vars = trim("`valid_vars'")

	if("`n_token'"!="non_token") {
		local n_token = "n_token"
		qui capture gen `n_token' = 0
		if(_rc == 0) {
			// if we ran out of space above, we'll run out of space again here; tolerate this case
			label variable `n_token' "Total number of words in '`varlist''"
		}
		// NOTE(review): when the gen fails, `n_token' is still passed to the export
		// call below; that only works if a variable named n_token already exists.
	}
	else {
		local n_token = ""
	}

	// fill in those word count columns and possibly the number-of-tokens column
	plugin call _ngram `vars' `n_token' `if' `in', export

	// dichotomize responses, if requested
	if("`binarize'"!="" & "`vars'" != "") {
		foreach var of varlist `vars' {
			qui replace `var' = `var' > 0
		}
	}

	// report the newly created variables to the caller
	return clear
	return local words = "`valid_vars'"

	// NOTE(review): `__rc' is never assigned in this file; presumably it is set by
	// the plugin. If it is empty this is a plain "exit" -- confirm.
	exit `__rc'
end


/*
 * load_stopwords: read stopwords_<language>.txt (located via findfile) and
 * return its lines joined by spaces.
 *
 * Argument: the language code (first token of the argument list).
 * Returns:  r(stopwords)
 * Errors:   exits with return code 7 (after printing a warning) when the
 *           file cannot be found.
 */
program define load_stopwords, rclass
	version 13
	gettoken language : 0

	local stopwords = ""

	capture qui findfile "stopwords_`language'.txt"
	if(_rc==0) {
		local using = "`r(fn)'"
	}
	else {
		di as error "Warning: Unable to find stopwords file 'stopwords_`language'.txt'. Stopwords will not be loaded."
		exit 7
	}

	tempname fd
	file open `fd' using "`using'", read text
	file read `fd' line
	while(r(eof) == 0) {
		local stopwords = `"`stopwords' `line'"'
		file read `fd' line
	}
	// release the handle explicitly instead of relying on program teardown
	file close `fd'

	// BEWARE: UI creep: this subroutine shouldn't know about "Removing",
	//         but it's the only place that knows about the filename
	di as txt "Removing stopwords specified in stopwords_`language'.txt"

	return clear
	return local stopwords = `"`stopwords'"'
end


// parse a libicu (level 2 canonicalization) locale ID into its parts,
//  as defined at http://userguide.icu-project.org/locale#TOC-Canonicalization
// ..more or less. This doesn't cover all cases.
// Beware: this is duplicated in _ngram.cpp:parse_libicu_locale_()
//
// Returns r(language) and r(region).
// NOTE(review): `language' and `region' are not assigned in this file; presumably
// the plugin call stores them in this program's local macros -- confirm.
program define parse_locale, rclass
	version 13
	gettoken locale : 0

	plugin call _ngram, parse_libicu_locale "`locale'"

	return clear
	return local language = "`language'"
	return local region = "`region'"
end