*! version 1.0.0 Matthew White 10sep2013
vers 11

loc RS	real scalar
loc RR	real rowvector
loc RC	real colvector
loc SS	string scalar
loc SR	string rowvector
loc SC	string colvector

// "ocq" for "open compound quote"
loc ocq		("`" + `"""')
// "ccq" for "close compound quote"
loc ccq		(`"""' + "'")

/* Comments below refer to "problematically unbalanced open quotes." An
unbalanced open quote is problematic if it is an open compound double quote or
if it is a left single quote followed by ` or the end of the string.

Examples:

`' and `""' are balanced quotes.

`x contains an unbalanced left single quote, but it is unproblematic, because it
is not followed by ` or the end of the string.

`"x, x`, and ``x all contain problematically unbalanced open quotes. */

mata:

/* -specialexp_other()- splits a string into separate strings when needed to
handle the ASCII characters 1-31, 127, and 255. For example, the string X\rY,
where \r is a carriage return, needs to be split into "X" + char(13) + "Y":
"X\rY" is not a valid expression.

-specialexp_other()- returns the split string as the rowvector els (for
"elements") along with a rowvector named literal that is 1 if the corresponding
element of els is a string literal and 0 otherwise. */
void function specialexp_other(`SS' s, `SR' els, `RR' literal)
{
	`RS' len, i
	`RR' ascii
	`SR' chars
	transmorphic t

	ascii = 1..31, 127, 255
	len = length(ascii)
	chars = J(1, len, "")
	for (i = 1; i <= len; i++) {
		chars[i] = char(ascii[i])
	}

	t = tokeninit("", chars, "")
	tokenset(t, s)
	els = tokengetall(t)

	literal = J(1, length(els), 1)
	for (i = 1; i <= len; i++) {
		literal = literal :& els :!= chars[i]
		els = subinstr(els, chars[i], sprintf("char(%f)", ascii[i]), .)
	}
}

// Split a string into separate strings when needed to correctly enclose a \.
// For example, the string \`x' needs to be split into "\" + "\`x'": "\`x'"
// results in `x', and "\\`x'" would expand the local macro.
`SR' function specialexp_escape(`SS' s)
{
	`RS' len, pos
	`SS' rest
	// "els for "elements"
	`SR' els

	// els will store the separate strings into which s will be split.
	els = ""
	// len (for "length") is the number of elements of els.
	len = 1
	rest = s
	while (rest != "") {
		pos = strpos(rest, "\")
		if (!pos) {
			els[len] = els[len] + rest
			rest = ""
		}
		else {
			els[len] = els[len] + substr(rest, 1, pos)
			rest = substr(rest, pos + 1, .)

			// If \ is followed by ` or $, start a new element.
			if (anyof(("`", "$"), substr(rest, 1, 1))) {
				els = els, ""
				len++
			}
		}
	}

	return(els)
}

/* -specialexp_balanced_quotes()- expects a colvector s that was tokenized from
a scalar x as follows:

t = tokeninit("", ("`", "'", `ocq', `ccq'), "")
tokenset(t, x)
s = tokengetall(t)'

-specialexp_balanced_quotes()- returns 1 if the concatenation of s (i.e., x)
does not contain a problematically unbalanced open quote and does not contain an
unbalanced close compound double quote; it returns 0 otherwise. */
// "probq" for "problematic quote"
`RS' function specialexp_balanced_quotes(`SC' s, |`RS' quotes, `RC' probq)
{
	// "inprob" for "inside problematic (quote)"
	`RS' rows, inside, inprob, row, i
	// "prob" for "problematic"; "isq" for "is quote"; "lsq" for "left single
	// quotes"; "ocq" for "open compound quotes."
	`RC' prob, isq, lsq, ocq
	`SS' type
	pointer(`RC') scalar pprob
	// "q" for "quotes"
	pointer(`SC') scalar q

	// If the quotes argument is specified, probq needs to be.
	assert(args() != 2)

	if (args() >= 2 & quotes) {
		q = &s
		pprob = &probq
	}
	else {
		rows = rows(s)
		prob = s[rows] == "`"
		if (rows > 1) {
			prob =
				s[|1 \ rows - 1|] :== "`" :&
				(s[|2 \ .|] :== "`" :|
				substr(s[|2 \ .|], 1, 1) :== `"""') \
				prob
		}
		prob = prob :| s :== `ocq'
		isq = s :== "`" :| s :== "'" :| s :== `ocq' :| s :== `ccq'
		q     = &select(s,    isq)
		pprob = &select(prob, isq)
	}
	rows = rows(*q)

	if (!rows)
		return(1)
	else if ((*q)[1] == "`" | (*q)[1] == `ocq') {
		lsq = ocq = inside = inprob = row = 0
		type = ""
		for (i = 1; i <= rows; i++) {
			lsq = lsq + ((*q)[i] == "`")   - ((*q)[i] == "'")
			ocq = ocq + ((*q)[i] == `ocq') - ((*q)[i] == `ccq')
			if (!inside) {
				if (lsq == 1 & (*q)[i] == "`" | ocq == 1 & (*q)[i] == `ocq') {
					type = (*q)[i]
					inside = 1
					inprob = (*pprob)[i]
					row = i
				}
				else if ((*q)[i] == `ccq')
					return(0)
			}
			else if (!lsq & type == "`" | !ocq & type == `ocq') {
				lsq = ocq = inside = inprob = 0
				if (i != row + 1) {
					if (!specialexp_balanced_quotes((*q)[|row + 1 \ i - 1|], 1,
						(*pprob)[|row + 1 \ i - 1|])) {
						return(0)
					}
				}
			}
		}

		if (!inside)
			return(1)
		else {
			// inprob = 1 if we're still inside a potentially problematic open
			// quote, for example, an open compound double quote.
			if (inprob)
				return(0)
			// !inprob & row == rows for a left single quote at the end of a
			// substring in the middle of the larger string. For example, `"`x"'
			// would result in a call of specialexp_balanced_quotes() with
			// s = "`" but probq = 0, in which case !inprob & row == rows.
			else if (row == rows)
				return(1)
			else {
				return(specialexp_balanced_quotes((*q)[|row + 1 \ .|], 1,
					(*pprob)[|row + 1 \ .|]))
			}
		}
		/*NOTREACHED*/
	}
	else if ((*q)[1] == "'") {
		if (rows == 1)
			return(1)
		else
			return(specialexp_balanced_quotes((*q)[|2 \ .|], 1, (*pprob)[|2 \ .|]))
		/*NOTREACHED*/
	}
	// Unbalanced close compound double quote
	else
		return(0)
	/*NOTREACHED*/
}

/* -specialexp_max_balanced()- expects a colvector s that was tokenized from a
scalar x as follows:

t = tokeninit("", ("`", "'", `ocq', `ccq'), "")
tokenset(t, x)
s = tokengetall(t)'

Further, it expects that s[1] is a left single quote or open compound double
quote.

-specialexp_max_balanced()- returns the maximum row such that the string formed
by concatenating the elements up to and including the row is enclosed by single
or compound double quotes and does not contain a problematically unbalanced open
quote. */
`RS' function specialexp_max_balanced(`SC' s)
{
	`RS' i
	// "balrows" for "balanced rows"
	`RC' balrows

	assert(s[1] == "`" | s[1] == `ocq')

	balrows = select(1::rows(s),
		!runningsum((s :== `ocq') - (s :== `ccq')) :&
		s :== (s[1] == "`" ? "'" : `ccq')
	)
	for (i = length(balrows); i >= 1; i--) {
		if (specialexp_balanced_quotes(s[|1 \ balrows[i]|]))
			return(balrows[i])
	}
	return(0)
}

// Split a string into separate strings when needed to correctly enclose a
// single or double quote. For example, the string `"' needs to be split into
// "`" + `"""' + "'".
// "unballsq" for "(problematically) unbalanced left single quote"
`SR' function specialexp_quotes(`SS' s, `RR' unballsq)
{
	// "dq" for "double quote"; "plen" for "piece length"; "prob" for
	// "problematic."
	`RS' len, dq, pos, plen, max, strdq, prob, i
	// "traillsq" for "trailing left single quote"
	`RR' traillsq
	// "c" for "character"
	`SS' rest, c, str
	// "els" for "elements"
	`SR' els
	// "trest" for "tokenized rest"
	`SC' trest
	transmorphic tinit, t

	// els will store the separate strings into which s will be split.
	els = ""
	// For each element of els, the corresponding element of unballsq is 1 if
	// the element contains a problematically unbalanced left single quote and
	// 0 otherwise.
	unballsq = 0
	// len (for "length") is the number of elements of els.
	len = 1
	// dq is 1 if the current element contains a double quote and 0 otherwise.
	dq = 0
	tinit = tokeninit("", ("`", "'", `ocq', `ccq'), "")
	rest = s
	while (rest != "") {
		pos = strpos(rest, "`") \ strpos(rest, "'") \ strpos(rest, `"""')
		if (!any(pos)) {
			els[len] = els[len] + rest
			rest = ""
		}
		else {
			pos = min(select(pos, pos))
			els[len] = els[len] + substr(rest, 1, pos - 1)
			rest = substr(rest, pos, .)
			c = substr(rest, 1, 1)

			// Unbalanced right single quote
			if (c == "'") {
				// If the right single quote would follow a ", start a new
				// element.
				if (substr(els[len], -1, 1) == `"""') {
					els = els, "'"
					unballsq = unballsq, 0
					len++
					dq = 0
				}
				else
					els[len] = els[len] + "'"
				plen = 1
			}
			// Simple double quote or unbalanced close compound double quote
			else if (c == `"""') {
				// If the element contains a problematically unbalanced left
				// single quote, start a new element.
				if (unballsq[len]) {
					els = els, `"""'
					unballsq = unballsq, 0
					len++
				}
				else
					els[len] = els[len] + `"""'
				dq = 1
				plen = 1

				// If it is an unbalanced close compound double quote, the
				// -c == "'"- case above will address the '.
			}
			// Left single quote or open compound double quote
			else {
				tokenset(t = tinit, rest)
				trest = tokengetall(t)'
				// If the quote is balanced...
				if (max = specialexp_max_balanced(trest)) {
					plen = sum(strlen(trest[|1 \ max|]))

					// Precede left single quotes by the \ escape character.
					trest = (trest :== "`") :* "\" :+ trest

					str = ""
					for (i = 1; i <= max; i++) {
						str = str + trest[i]
					}

					// If the element contains a problematically unbalanced left
					// single quote and str contains a double quote, start a new
					// element.
					strdq = strpos(str, `"""')
					if (unballsq[len] & strdq) {
						els = els, str
						unballsq = unballsq, 0
						len++
						dq = strdq
					}
					else {
						els[len] = els[len] + str
						dq = dq | strdq
					}
				}
				// If the quote is unbalanced...
				else {
					// If the quote is problematically unbalanced and the
					// element contains a double quote, start a new element.
					prob = anyof(("`", `"""', ""), substr(rest, 2, 1))
					if (dq & prob) {
						els = els, "\\`"
						unballsq = unballsq, 0
						len++
						dq = 0
					}
					else
						els[len] = els[len] + "\\`"
					unballsq[len] = unballsq[len] | prob
					plen = 1

					// If it is an unbalanced open compound double quote, the
					// -c == `"""'- case above will address the ".
				}
			}

			rest = substr(rest, plen + 1, .)
		}
	}

	/* Left single quotes need to be preceded by the \ escape character unless
	they are at the end of a string. Within a single string of the
	-specialexp()- expression, the \ is unnecessary for unbalanced left single
	quotes: "`x" is the same as "\`x". However, it is necessary across strings:
	for `\\', -specialexp()- should return "\`\" + "\'", not "`\" + "\'". The
	exception to this is a left single quote at the end of a string: for `"',
	"`" + `"""' + "'" is the same as "\`" + `"""' + "'". Similar rules hold for
	$. */
	traillsq = substr(els, -2, .) :== "\\`"
	els = !traillsq :* els + traillsq :* (substr(els, 1, strlen(els) :- 2) :+ "`")

	return(els)
}

`SS' function specialexp(`SS' s, |`RS' nstrs)
{
	// "els" suffix for "elements": "nels" for "number of elements."
	`RS' nels, i
	// "escliteral" for "escape literal";
	// "unballsq" for "(problematically) unbalanced left single quote";
	// "prob" for "problematic."
	`RR' otherliteral, escliteral, literal, unballsq, prob, match
	`SR' otherels, escels, split, els

	if (strpos(s, char(0))) {
		errprintf("s cannot contain binary 0\n")
		exit(198)
		/*NOTREACHED*/
	}

	if (s == "") {
		nstrs = 1
		return(`""""')
	}

	// ASCII characters 1-31, 127, and 255
	pragma unset otherels
	pragma unset otherliteral
	specialexp_other(s, otherels, otherliteral)

	// \ escape character
	escels = J(1, 0, "")
	escliteral = J(1, 0, .)
	nels = length(otherels)
	for (i = 1; i <= nels; i++) {
		if (otherliteral[i]) {
			split = specialexp_escape(otherels[i])
			escels = escels, split
			escliteral = escliteral, J(1, length(split), 1)
		}
		else {
			escels = escels, otherels[i]
			escliteral = escliteral, 0
		}
	}

	// Single and double quotes
	els = J(1, 0, "")
	literal = unballsq = J(1, 0, .)
	nels = length(escels)
	pragma unset prob
	for (i = 1; i <= nels; i++) {
		if (escliteral[i]) {
			split = specialexp_quotes(escels[i], prob)
			els = els, split
			literal = literal, J(1, length(split), 1)
			unballsq = unballsq, prob
		}
		else {
			els = els, escels[i]
			literal = literal, 0
			unballsq = unballsq, 0
		}
	}

	// Precede $ by the escape character \ unless it is at the end of an
	// element. (See -specialexp_quotes()- for an explanation of when left
	// single quotes and $ need the \.)
	els = subinstr(els, "$", "\\$", .)
	match = substr(els, -2, .) :== "\\$"
	els = !match :* els + match :* (substr(els, 1, strlen(els) :- 2) :+ "$")

	/* If an element ends in the escape character \ and (1) the next element is
	a left single quote and the current element does not contain a double quote,
	or (2) the next element is $, or (3) the next element starts with an open
	compound double quote and the current element does not contain a
	problematically unbalanced left single quote, add an extra \ to the current
	element and concatenate it with the next one. */
	nels = length(els)
	for (i = 1; i <= nels - 1; i++) {
		if (substr(els[i], -1, 1) == "\" &
			(els[i + 1] == "`" & !strpos(els[i], `"""') |
			els[i + 1] == "$" |
			substr(els[i + 1], 1, 2) == `ocq' & !unballsq[i])
		) {
			els[i] = els[i] + "\" + els[i + 1]
			els[i + 1] = ""
			i++
		}
	}
	literal = select(literal, els :!= "")
	els = select(els, els :!= "")

	// Adorn els in double quotes.
	match = strpos(els, `"""') :!= 0
	els = literal :* (match :* "`" :+ `"""') :+
		els :+
		literal :* (`"""' :+ match :* "'")

	nstrs = length(els)
	return(invtokens(els, " + "))
}

end