/*

chimchar 2.0.1 10 October 2023
Tommy Morgan - labhours@tmorg.org
CHanging IMpractical CHARacters: a Stata command
that cleans string variables of all their annoying characters. Turns
Unicode characters into their closest ASCII counterpart (non-extended)
if they represent a letter and removes them if they don't.

Alternatively, strips numeric string variables of their non-numeric
characters as preparation for a destring. Also can switch commas and
periods in datasets that come with decimal commas.

*/

program define chimchar
	version 14

	* chimchar [varlist], {numokay | numremove | numonly} [dpswitch]
	*
	* Cleans every string variable in varlist in place; non-string variables
	* are reported and skipped. Exactly one mode option is required:
	*   numokay    keep a-z, 0-9, comma, hyphen and period
	*   numremove  keep a-z only
	*   numonly    keep 0-9, hyphen and period, then collapse repeated
	*              periods and repeated hyphens to a single one
	* dpswitch swaps commas and periods before any other cleaning.
	* Exits with return code 198 when zero or several mode options are given.
	syntax [varlist] , [NUMOKAY NUMREMOVE NUMONLY] [DPSWITCH]

	* The option check does not depend on the data, so do it once up front
	* and before any variable has been modified.
	local nmodes = ("`numokay'" != "") + ("`numremove'" != "") + ("`numonly'" != "")
	if `nmodes' > 1 {
		di as err "options numonly, numokay, and numremove are mutually exclusive, pick just one"
		exit 198
	}
	if `nmodes' == 0 {
		di as err "must specify either numokay, numremove, or numonly option"
		exit 198
	}

	* Letters with no canonical decomposition would be dropped by the ASCII
	* conversion below, so they are mapped by hand first. The list holds
	* pairs of Unicode code point and ASCII replacement. Code points are used
	* instead of literal characters so that the mapping cannot be damaged by
	* the encoding or normalization of this source file.
	*   198 AE ligature (capital)      216 O with stroke (capital)
	*   208 eth (capital)              223 sharp s
	*   230 ae ligature (small)        248 o with stroke (small)
	*   240 eth (small)                305 dotless i
	*   306 IJ ligature (capital)      307 ij ligature (small)
	*   312 kra                        329 n preceded by apostrophe
	*   330 eng (capital)              331 eng (small)
	*   338 OE ligature (capital)      339 oe ligature (small)
	local translit 198 ae 216 o 208 d 223 ss
	local translit `translit' 230 ae 248 o 240 d 305 i
	local translit `translit' 306 ij 307 ij 312 k 329 n
	local translit `translit' 330 ng 331 ng 338 oe 339 oe

	foreach strvar in `varlist' {

		* only string variables can be cleaned
		capture confirm string variable `strvar'
		if _rc != 0 {
			di as txt "skipping `strvar' because it is not a string variable"
			continue
		}

		* swap decimal commas and periods through a temporary placeholder
		if "`dpswitch'" != "" {
			di as text "Now switching all commas in `strvar' to periods and vice-versa"
			replace `strvar' = subinstr(`strvar', ".", "+_+yeet+_+", .)
			replace `strvar' = subinstr(`strvar', ",", ".", .)
			replace `strvar' = subinstr(`strvar', "+_+yeet+_+", ",", .)
			di as text "All commas in `strvar' have now been switched to periods and vice-versa"
		}

		* The left-quote character is printed through char(96) because a
		* literal one inside a quoted string would open a macro reference.
		di as text "Now removing uniquely obnoxious characters like " char(96) " and () from `strvar'"

		* strip left-quote, apostrophe, double quote and both parentheses;
		* char() is evaluated at run time so no macro quoting is involved
		quietly foreach code of numlist 96 39 34 40 41 {
			replace `strvar' = subinstr(`strvar', char(`code'), "", .)
		}

		di as text "Uniquely obnoxious characters like " char(96) " and () have now been removed from `strvar'"
		di as text "Now replacing special letters like Æ and ĸ with normal letters in `strvar'"

		quietly {
			* walk the code point and replacement pairs
			local todo `translit'
			while "`todo'" != "" {
				gettoken cp todo : todo
				gettoken ascii todo : todo
				replace `strvar' = subinstr(`strvar', uchar(`cp'), "`ascii'", .)
			}
			* decompose accented letters, drop everything that is not ASCII
			* (mode 2 of ustrto skips unconvertible characters), lower-case
			replace `strvar' = ustrlower(ustrto(ustrnormalize(`strvar', "nfd"), "ascii", 2))
		}

		di as text "Special letters like Æ and ĸ have now been replaced with normal letters in `strvar'"

		* From here on the value is plain lower-case ASCII, so one negated
		* character class per mode removes everything that should not stay.

		* numokay: keep letters, digits, comma, hyphen and period
		if "`numokay'" != "" {
			di as text "Now removing all remaining non-numeric and non-letter ASCII characters from `strvar'"
			quietly replace `strvar' = ustrregexra(`strvar', "[^0-9a-z,.\-]", "")
			di as text "All remaining non-numeric and non-letter ASCII characters have been removed from `strvar'"
		}

		* numremove: keep letters only
		if "`numremove'" != "" {
			di as text "Now removing all remaining non-letter ASCII characters from `strvar'"
			quietly replace `strvar' = ustrregexra(`strvar', "[^a-z]", "")
			di as text "All remaining non-letter ASCII characters have been removed from `strvar'"
		}

		* numonly: keep digits, hyphen and period, then collapse runs
		* NOTE(review): the message below mentions isolated decimal markers,
		* but a value consisting of a single period or hyphen is left as is;
		* confirm whether such values should be blanked before a destring.
		if "`numonly'" != "" {
			di as text "Now removing all remaining non-numeric characters from `strvar' and isolated decimal markers"
			quietly {
				replace `strvar' = ustrregexra(`strvar', "[^0-9.\-]", "")
				replace `strvar' = ustrregexra(`strvar', "[.]{2,}", ".")
				replace `strvar' = ustrregexra(`strvar', "-{2,}", "-")
			}
			di as text "All remaining non-numeric characters have been removed from `strvar'"
		}

		di as result "`strvar' is clean now!"
	}

end