*! mm_txttool (wrapper for text mining tools) 1.1 Unislawa Williams 24Dec2013 version 10 mata: function mm_txttool(string scalar txtdata, | string scalar noclean, string scalar stem, /// string scalar stopwordlist, string scalar subwordlist, string scalar genfield, /// string scalar prefix, string scalar nooutput, string scalar touse) { real scalar i string v, sentx, owordlist, vsform /// create data from text field v = st_sdata(.,txtdata, touse) /// original count of total words and unique words if (nooutput !="nooutput") { /// stack all the words in each case for (i=1;i<=rows(v);i++) { sentx=(tokens(v[i])) if (i==1) { owordlist = sentx' } else { owordlist = owordlist \ sentx' } } /// output the counts of the stacked words st_local("ototwords",strofreal(rows(owordlist))) st_local("ouwords",strofreal(rows(uniqrows(owordlist)))) } /// call clean, if requested if (noclean !="noclean") { v = cleantxt(v) } /// call subwords, if requested if (subwordlist!="") { v = subwords(v,subwordlist) } /// call stopwords, if requested if (stopwordlist!="") { v = stopwords(v,stopwordlist) } /// call stemmer, if requested if (stem=="stem") { v = stemcolumn(v) } /// generate new text field, if requested if (genfield !=txtdata) { /// create format for new txtfield with the length of the maximum line of text vsform = "str"+strofreal(colmax(strlen(v))) /// add variables with correct length and store the text field (void) st_addvar(vsform,genfield) st_sstore(.,genfield, touse, v) } /// replace existing text field, if requested if (genfield ==txtdata) { st_sstore(.,genfield, touse, v) } /// bag the words, if requested if (prefix != "") { wordbag(v,prefix,touse) } /// final count of total words and unique words, if desired if (nooutput !="nooutput") { for (i=1;i<=rows(v);i++) { sentx=(tokens(v[i])) if (i==1) { owordlist = sentx' } else { owordlist = owordlist \ sentx' } } st_local("ftotwords",strofreal(rows(owordlist))) st_local("fuwords",strofreal(rows(uniqrows(owordlist)))) } } end *! cleantxt (removing special characters and white space) 1.0.1 Unisia Williams 08Sep2013 mata: string cleantxt(string matrix txtfield ) { real scalar i, q /// start loop through rows of text for (i=1;i<=rows(txtfield);i++) { /// make lower case and remove extra white space txtfield[i] = strtrim(stritrim(strlower(txtfield[i]))) /// keep characters 32 (white space), 48-57 (numerals) and 97-122 (lower case letters) /// chars 65-90 (capital letters) can be skipped after using strlower for (q=1;q<=31;q++) { txtfield[i] = subinstr(txtfield[i],char(q),"") } for (q=33;q<=47;q++) { txtfield[i] = subinstr(txtfield[i],char(q),"") } for (q=58;q<=64;q++) { txtfield[i] = subinstr(txtfield[i],char(q),"") } for (q=91;q<=96;q++) { txtfield[i] = subinstr(txtfield[i],char(q),"") } for (q =123;q<=255;q++) { txtfield[i] = subinstr(txtfield[i],char(q),"") } } return(txtfield) } end *! subwords (substitution of words) 1.0.1 Unisia Williams 08Sep2013 mata: string subwords(string vector txtfield, string scalar subwordfile) { real scalar i, j string subwordmat, subbedtxt /// grab list of substitutions from file subwordlist = cat(subwordfile) /// parse the list of tab-delimited substitutions into Nx2 matrix called subwordmat t = tokeninit("", char(9), "", 0, 0) subwordmat= J(rows(subwordlist),2, "") for (i=1;i<=rows(subwordlist);i++) { tokenset(t,subwordlist[i]) j=1 while ((token = tokenget(t))!="") { if (token==char(9)) j++ else subwordmat[i,j] = token } } subbedtxt = txtfield /// loop through i rows of txtfield and substitute j subword for (i=1;i<=rows(subbedtxt);i++) { for (j=1;j<=rows(subwordmat);j++) { subbedtxt[i] = strtrim(stritrim(subinword(subbedtxt[i],subwordmat[j,1],subwordmat[j,2]))) } } return(subbedtxt) } end *! stopwords (removal of listed words) 1.0.1 Unisia Williams 08Sep2013 mata: string stopwords(string vector txtfield, string scalar stopwordfile) { real scalar i, j string stopwordlist, stoppedtxt /// grab list of stopwords from file stopwordlist = cat(stopwordfile) stoppedtxt = txtfield /// loop through i rows of txtfield and clean j stopword for (i=1;i<=rows(stoppedtxt);i++) { for (j=1;j<=rows(stopwordlist);j++) { stoppedtxt[i] = strtrim(stritrim(subinword(stoppedtxt[i],stopwordlist[j],""))) } } return(stoppedtxt) } end *! porterstem (implementing Porter's 1980 word stemming procedure) 1.8.1 Unisia Williams 08Sep2013 *! See Porter (1980) for explanation of steps mata: string scalar porterstem(string scalar word) { if (strlen(word)<3) { return(word) } else { /// m meaures and conditions string scalar mgr0 string scalar mgr1 string scalar hasvowel string scalar cond_o mgr0 = "^([^aeiou][^aeiouy]*)?([aeiouy][aeiou]*)([^aeiou][^aeiouy]*)" mgr1 = "^([^aeiou][^aeiouy]*)?([aeiouy][aeiou]*)([^aeiou][^aeiouy]*)([aeiouy][aeiou]*)([^aeiou][^aeiouy]*)" hasvowel = "([^aeiou][^aeiouy]*)?[aeiouy]" cond_o = "([^aeiou])([aeiouy])([^aeiouwxy])$" /// replace y with Y to avoid any matching issues if (substr(word,1,1)=="y") { word = subinstr(word,"y","Y",1) } else { } /// Porter's step 1a if (regexm(word, "sses$")) { word = regexr(word, "sses$", "ss") } else if (regexm(word, "ies$")) { word = regexr(word, "ies$", "i") } else if (regexm(word, "ss$")) { } else if (regexm(word, "s$") & !regexm(word,"ss$")) { word = strreverse(subinstr(strreverse(word),"s","",1)) /** this may be a bug - regexr won't replace a single character at the end of the string **/ } else { } /// Porter's step 1b if (regexm(word, "eed$")) { if (regexm(regexr(word, "eed$", "ee"),mgr0)) { word = regexr(word, "eed$", "ee") } else { } } else if (regexm(word, "(ed|ing)$") & regexm(regexr(word,"(ed|ing)$", ""),hasvowel)) { word = regexr(word,"(ed|ing)$", "") if (regexm(word, "(at|bl|iz)$")) { word = word + "e" } else if (regexm(word, "[^aeiouylsz]$") & (substr(word,-1,1)==substr(word,-2,1))) { word = substr(word,1,strlen(word)-1) } else if (regexm(word,mgr0) & !regexm(word,mgr1) & regexm(word,cond_o)) { word = word + "e" } else { } } else { } /// Porter's step 1c if (regexm(word, "y$")) { if (regexm(substr(word,1,strlen(word)-1),hasvowel)) { word = substr(word,1,strlen(word)-1) + "i" } else { } } else { } /// Porter's step 2 if (regexm(word,"(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$")) { if (substr(word,strlen(word)-1,1) == "a") { step2list = "ational$","tional$" step2suf = "ate","tion" } else if (substr(word,strlen(word)-1,1) == "c") { step2list = "enci$","anci$" step2suf = "ence","ance" } else if (substr(word,strlen(word)-1,1) == "e") { step2list = "izer$" step2suf = "ize" } else if (substr(word,strlen(word)-1,1) == "l") { step2list = "bli$","alli$","entli$","eli$","ousli$" step2suf = "ble","al","ent","e","ous" } else if (substr(word,strlen(word)-1,1) == "o") { step2list = "ization$","ation$","ator$" step2suf = "ize","ate","ate" } else if (substr(word,strlen(word)-1,1) == "s") { step2list = "alism$","iveness$","fulness$","ousness$" step2suf = "al","ive","ful","ous" } else if (substr(word,strlen(word)-1,1) == "t") { step2list = "aliti$","iviti$","biliti$" step2suf = "al","ive","ble" } else { step2list = "logi$" step2suf = "log" } for (iter=1;iter<=cols(step2list);iter++) { if (regexm(word, step2list[iter])) { if (regexm(regexr(word, step2list[iter], ""),mgr0)) { word = regexr(word,step2list[iter],step2suf[iter]) } else { } break } else { } } } else { } /// Porter's step 3 if (regexm(word,"(icate|ative|alize|iciti|ical|ful|ness)$")) { if (substr(word,strlen(word),1)=="e") { step3list = "icate$","ative$","alize$" step3suf = "ic","","al" } else { step3list = "iciti$","ical$","ful$","ness$" step3suf = "ic","ic","","" } for (iter=1;iter<=cols(step3list);iter++) { if (regexm(word, step3list[iter])) { if (regexm(regexr(word, step3list[iter], ""),mgr0)) { word = regexr(word,step3list[iter],step3suf[iter]) } else { } break } else { } } } else { } /// Porter's step 4 if (regexm(word,"(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ion|ou|ism|ate|iti|ous|ive|ize)$")) { if (substr(word, -1,1)=="e") { step4list = "ance$","ence$","able$","ible$","ate$","ive$","ize$" } else if (substr(word, -1,1)=="t") { step4list = "ement$","ment$","ent$","ant$" } else { step4list = "al$","er$","ic$","ion$","ous$","ou$","ism$","iti$" } for (iter=1;iter<=cols(step4list);iter++) { if (regexm(word, step4list[iter])) { if (step4list[iter] != "ion$") { if (regexm(regexr(word, step4list[iter], ""),mgr1)) { word = regexr(word,step4list[iter],"") } else { } } else { if (regexm(regexr(word, "ion$", ""),mgr1) & (regexm(regexr(word, "ion$", ""),"s$") | regexm(regexr(word, "ion$", ""),"t$"))) { word = regexr(word, "ion$", "") } else { } } break } else { } } } else { } /// Porter's step 5a if (regexm(word, "e$") & regexm(strreverse(subinstr(strreverse(word),"e","",1)),mgr1)) { word = strreverse(subinstr(strreverse(word),"e","",1)) } else if (regexm(word, "e$") & regexm(strreverse(subinstr(strreverse(word),"e","",1)),mgr0) & !regexm(strreverse(subinstr(strreverse(word),"e","",1)),mgr1) & !regexm(strreverse(subinstr(strreverse(word),"e","",1)),cond_o)) { word = strreverse(subinstr(strreverse(word),"e","",1)) } else { } /// Porter's step 5b if (regexm(word,mgr1) & regexm(word,"ll$")) { word = strreverse(subinstr(strreverse(word),"l","",1)) } else { } /// Replace Y with y if (substr(word,1,1)=="Y") { word = subinstr(word,"Y","y",1) } else { } return(word) } } end *! stemcolumn (apply porterstem() to a column vector) 1.0.1 Unisia Williams 08Sep2013 mata: string stemcolumn(string vector txtfield) { real i, j string stemtxtfield, temprow /// loop through txt data stemtxtfield = J(rows(txtfield),1,"") for (i=1;i<=rows(txtfield);i++) { /// separate the tokens in each row and stem the tokens temprow= tokens(txtfield[i]) for (j=1;j<=cols(temprow);j++) { temprow[j] = porterstem(temprow[j]) } /// recombine the stemmed tokens stemtxtfield[i] = invtokens(temprow) } return(stemtxtfield) } end *! wordbag (represent text as bag-of-words) 1.0.1 Unisia Williams 08Sep2013 mata: void wordbag(string txtfield , string prefix, string scalar touse) { string words, sent, words_vnames real scalar i, j, k, matched real matrix wordcounts /// vector to store each unique word words = J(1,1, "") /// matrix to store counts of each unique word wordcounts = J(rows(txtfield),1,0) /// start loop through sentences and tokenize each sentence for (i=1;i<=rows(txtfield);i++) { sent = tokens(txtfield[i]) /// loop through tokens of sentences, set matched to 0 for each new token for (j=1;j<=cols(sent);j++) { matched = 0 /// loop through words vector, set matched to the count if the token matches an existing word for (k=1;k<=cols(words);k++) { if (sent[j]==words[k]) { matched = k break } } /// if match found, increment count of word in wordcounts matrix at position given by matched if (matched>0) { wordcounts[i,matched] = wordcounts[i,matched] + 1 } /// if no match found, add word and set its count to 1 in wordcounts matrix else if (matched==0) { words = words,sent[j] wordcounts = wordcounts, J(rows(wordcounts),1,0) wordcounts[i,cols(wordcounts)] = 1 } } } /// trim off the empty first column of words and wordcounts words = words[.,2::cols(words)] wordcounts = wordcounts[.,2::cols(wordcounts)] /// add thew prefix to all the words, so that they are valid stata variables words_vnames = words for (i=1;i<=cols(words);i++) { words_vnames[i] = prefix + words[i] } /// write vars (words) and data (wordcounts) to data (void) st_addvar("int",words_vnames) st_store(.,words_vnames, touse, wordcounts) } end