# version 1.02  06apr2023  DIME Analytics dimeanalytics@worldbank.org
# Import packages ============
import os
import re
import sys
import stata_linter_detect as sld

# Version Global
## VERY IMPORTANT: Update the version number here every time there's an update
## in the package. Otherwise this will cause a major bug
VERSION = "1.02"

# Function to update comment delimiter =============
# (detection works only when comment delimiter == 0)
def update_comment_delimiter(comment_delimiter, line):
    '''
    This function detects if a line is opening a comment section
    in a Stata dofile. Comment sections are delimited by the
    charaters "/*" and "*/"
    '''
    # if "/*" and "*/" are in the same line, never mind
    if re.search(r"\/\*.*\*\/", line):
        comment_delimiter += 0
    # if "/*" (opening) detected, add 1
    elif re.search(r"\/\*", line):
        comment_delimiter += 1
    # if "*/" (closing) detected, subtract 1
    elif (re.search(r"\*\/", line) != None) & (comment_delimiter > 0):
        comment_delimiter -= 1
    return(comment_delimiter)

# Functions for auto-correction ===================

# Convert delimit to three forward slashes -------------------
def delimit_to_three_forward_slashes(input_file, output_file, indent, tab_space, linemax):
    output_list = []
    with open(input_file, "r") as reader:
        input_lines = reader.readlines()
        delimit_on = 0
        comment_delimiter = 0
        for line_index, line in enumerate(input_lines):
            # update comment_delimiter
            comment_delimiter = update_comment_delimiter(comment_delimiter, line)
            if comment_delimiter > 0:
                output_list.append(line)
            elif comment_delimiter == 0:
                # check if "#delimit (something other than cr)" is included in a line
                if re.search(r"^#delimit(?! cr)", line.lstrip()):
                    delimit_on = 1
                    # store the character used for line breaks (ignoring comments)
                    # (if not specified, default is ";")
                    line_split = re.split(r"//", line)[0].strip().split(" ")
                    if len(line_split) > 1:
                      delimit_symbol = line_split[1]
                    else:
                      delimit_symbol = ";"
                # check if "#delimit cr" appears in a line, which means
                # the end of delimit function
                elif re.search(r"^#delimit cr", line.lstrip()):
                    delimit_on = 0
                # for other lines, if delimit_on = 0, then just use the line, and
                # if delimit_on = 1, then add "///" at the end of line but before
                # any comments
                else:
                    if delimit_on == 0:
                        output_list.append(line)
                    elif delimit_on == 1:
                        # get any non-comment part of the line and
                        # strip any redundant whitespaces at the end
                        line_split_for_comment = re.split(r"//", line)
                        line_main = line_split_for_comment[0]
                        if len(line_split_for_comment) > 1:
                            line_comment = line_split_for_comment[1]
                        line_main_rstrip = line_main.rstrip()
                        # if the line is not blank, add appropriate line break commands (///)
                        if len(line_main_rstrip) > 0:
                            # if the line does not end with the delimit symbol (such as ";"),
                            # then that means the command continues to the next line,
                            # so add a line break
                            if line_main_rstrip[-1] != delimit_symbol:
                                output_line = line_main_rstrip + " ///"
                            # if the line does end with the delimit symbol, then
                            # just remove the last symbol in the line
                            elif line_main_rstrip[-1] == delimit_symbol:
                                output_line = line_main_rstrip[:-1]

                            # replace all the remaining delimit symbols to "\n"
                            output_line = re.sub(delimit_symbol, "\n", output_line)

                            # if there is any comment in the line, then
                            # just append the comment
                            if len(line_split_for_comment) > 1:
                                output_line = output_line + " //" + line_comment
                            # if there is no comment in the line, then
                            # just add a newline command (\n) at the end
                            elif len(line_split_for_comment) == 1:
                                output_line = output_line + " \n"

                            output_list.append(output_line)

                        # if the line is blank, just append the blank line
                        elif len(line_main_rstrip) == 0:
                            output_list.append(line)

    with open(output_file, "w") as writer:
        for output_line in output_list:
            writer.write(output_line)


# Convert hard tabs to soft tabs (= whitespaces) ----------------------
def tab_to_space(input_file, output_file, indent, tab_space, linemax):
    output_list = []
    with open(input_file, "r") as reader:
        input_lines = reader.readlines()
        comment_delimiter = 0
        for line_index, line in enumerate(input_lines):
            # replace the hard tabs detected in a line to soft tabs (whitespaces)
            spaces = ' ' * int(tab_space)
            pattern = r'^( *)(\t+)([^\t].*\n{0,1})'
            match = re.match(pattern, line)
            if match:
                output_list.append(match.group(1) +
                    match.group(2).replace('\t', spaces) +
                    match.group(3))
            else:
                output_list.append(line)
    with open(output_file, "w") as writer:
        for output_line in output_list:
            writer.write(output_line)

# Use indents in brackets after for and while loops or if/else conditions --------------------
def indent_in_bracket(input_file, output_file, indent, tab_space, linemax):
    with open(input_file, "r") as reader:
        input_lines = reader.readlines()
        loop_start = []
        bracket_start = []
        bracket_pair = []
        nest_level = 0
        max_nest_level = 0
        comment_delimiter = 0
        for line_index, line in enumerate(input_lines):
            # update comment_delimiter
            comment_delimiter = update_comment_delimiter(comment_delimiter, line)
            if comment_delimiter == 0:
                # get the main command of the line (ignoring comments at the end) and remove
                # redundant whitespaces
                line_rstrip = re.sub(r"(\/\/)|(\/\*).*", r"", line).rstrip()
                # if the line is not blank or has any command other than comments,
                # do the followings
                if len(line_rstrip) > 0:
                    # check if the line starts with commands that potentially have curly brackets
                    # (but ignore if this line is the continuation from the previous line,
                    # because then the expression here should not have curly brackets)
                    if (
                        (re.search(r"^(qui[a-z]*\s+)?(foreach |while |forv|if |else |cap)", line.lstrip()) != None) &
                        (re.search(r"\/\/\/", input_lines[max(line_index - 1, 0)]) == None)
                        ):
                        # if the line ends with an open curly bracket,
                        # then tag it (here the depth of the nests are stored as well)
                        if line_rstrip[-1] == "{":
                            loop_start.append(line_index)
                            bracket_start.append(line_index)
                            nest_level += 1
                            max_nest_level = max(max_nest_level, nest_level)
                        # if the line does not end with an open curly bracket but includes line breaks,
                        # then search for the line including the open curly bracket in the following lines
                        # and tag the line
                        elif (line_rstrip[-1] != "{") & (re.search(r"\/\/\/", line) != None):
                            loop_start.append(line_index)
                            for i in range(line_index, len(input_lines)):
                                temp_line_rstrip = re.sub(r"\/\/.*", r"", input_lines[i]).rstrip()
                                if temp_line_rstrip[-1] == "{":
                                    bracket_start.append(i)
                                    break
                            nest_level += 1
                            max_nest_level = max(max_nest_level, nest_level)
                    # check if the line ends with a closing curly bracket
                    # (ignore it if that is not used for global macro)
                    if (line_rstrip[-1] == "}") & (not re.search(r"\$.?{", line)):
                        bracket_pair.append([loop_start.pop(), line_index, nest_level, bracket_start.pop()])
                        nest_level -= 1
        # for each depth of nests, add appropriate indentations
        for nest_level in range(1, max_nest_level + 1):
            for pair in bracket_pair:
                if pair[2] == nest_level:
                    # get the position of where to start indentations
                    start_indent = len(input_lines[pair[0]]) - len(input_lines[pair[0]].lstrip())
                    # for each line in the nest, do the followings
                    for j in range(pair[0] + 1, pair[1]):
                        # if the line is blank, ignore it
                        if len(input_lines[j].lstrip()) == 0:
                            pass
                        # if the line is not blank, then add indentations at the beginning of the line
                        elif len(input_lines[j].lstrip()) > 0:
                            input_lines[j] = " " * (start_indent + int(indent)) + (input_lines[j].lstrip())
    with open(output_file, "w") as writer:
        for output_line in input_lines:
            writer.write(output_line)

# Split too long line (> linemax characters) to multiple lines
# (but do not break strings in double quotes (""), parentheses, or curly brackets) --------------------
def too_long_line(input_file, output_file, indent, tab_space, linemax):
    output_list = []
    with open(input_file, "r") as reader:
        input_lines = reader.readlines()
        newline_flag = 0
        comment_delimiter = 0
        for line_index, line in enumerate(input_lines):
            # update comment_delimiter
            comment_delimiter = update_comment_delimiter(comment_delimiter, line)
            if comment_delimiter > 0:
                output_list.append(line)
            elif comment_delimiter == 0:
                # do nothing if any of the following conditions are met
                if (
                    (len(line) <= int(linemax)) | # the line is not too long, or
                    ((line.lstrip() + " ")[0] == "*") | # the line is a comment
                    ((line.lstrip() + "  ")[:2] == "//") # line contains a comment
                    ):
                    output_list.append(line)
                # otherwise, do the followings
                else:
                    # separate the comment part and the command part of the line
                    line_split_for_comment = re.split(r"//", line)
                    line_main = line_split_for_comment[0]
                    if "\n" in line_main:
                        line_main = line_main.rstrip() + "\n"
                    else:
                        line_main = line_main.rstrip()
                    if len(line_split_for_comment) > 1:
                        line_comment = line_split_for_comment[1]
                    line_indent = (
                        len(line_main.rstrip()) -
                        len(line_main.rstrip().expandtabs(int(indent)).lstrip())
                        )

                    i = 0
                    break_line = []
                    potential_break_line = []
                    double_quote_count = 0
                    parenthesis_count = 0
                    curly_count = 0
                    # looking at each character of a line, tag where to break the line
                    for j, c in enumerate(line_main.lstrip()):

                        position = j + len(line_main) - len(line_main.lstrip())

                        if c == '''"''':
                            double_quote_count = 1 - double_quote_count
                        elif c == "(":
                            parenthesis_count += 1
                        elif c == ")":
                            parenthesis_count -= 1
                        elif c == "{":
                            curly_count += 1
                        elif c == "}":
                            curly_count -= 1

                        # We check "potential" break lines first
                        if ((c == "," or c == " ") and # break line at "," or " "
                            (double_quote_count == 0) and # ignore if in double quotes
                            (parenthesis_count == 0) and # ignore if in parentheses
                            (curly_count == 0)# ignore if in curly brackets
                            ):

                            if c == " ":

                                position2 = line_indent + i + 4
                                potential_break_line.append(position)

                                # If the soon-to-be new line is equal to the linemax,
                                # we add the last potential line break position
                                if position2 >= int(linemax):
                                    break_line.append(potential_break_line[-1])
                                    i = int(indent) + position - potential_break_line[-1]
                                else:
                                    i += 1

                            elif c == ",":

                                position2 = line_indent + i + 5

                                # If the soon-to-be new line is equal to the linemax,
                                # we add the last potential line break position
                                if position2 >= int(linemax):
                                    break_line.append(potential_break_line[-1])
                                    i = int(indent) + position - potential_break_line[-1]
                                else:
                                    i += 1

                                potential_break_line.append(position + 1)

                        else:

                            position2 = line_indent + i + 4
                            if position2 >= int(linemax):
                                break_line.append(potential_break_line[-1])
                                i = int(indent) + position - potential_break_line[-1]
                            else:
                                i += 1

                    # break lines
                    line_split = []
                    break_line_index = [0]
                    break_line_index.extend(break_line)
                    break_line_index.append(len(line_main))
                    for k in range(len(break_line_index) - 1):
                        # if no line break is needed, just append the line
                        if (break_line_index == 2):
                            line_split.append(
                                line_main[break_line_index[k]:break_line_index[k + 1]].rstrip()
                                )
                        # otherwise, break the line according to the positions of characters tagged above
                        else:
                            line_split.append(line_main[break_line_index[k]:break_line_index[k + 1]])

                    # if no line break is needed, then just append the line
                    # with appropriate indentations (and commends if needed)
                    if len(line_split) == 1:
                        if len(line_split_for_comment) > 1:
                            output_list.append(
                                " " * line_indent + line_split[0].lstrip() + " //" + line_comment
                                )
                        elif len(line_split_for_comment) == 1:
                            output_list.append(" " * line_indent + line_split[0].lstrip() + "\n")
                    # otherwise, break the line
                    elif len(line_split) > 1:
                        for i, temp_line in enumerate(line_split):
                            # the first line
                            if i == 0:
                                new_line = " " * line_indent + temp_line.lstrip() + " ///\n"
                            # from the second to the last to the second line
                            elif (i > 0) & (i < len(line_split) - 1):
                                # if the previous line does not include a line break, then
                                # add an appropriate indentations
                                if newline_flag == 0:
                                    new_line = " " * (line_indent + int(indent)) + temp_line.lstrip() + " ///\n"
                                # if the previous line does include a line break, then
                                # assuming that the indentation is correctly done,
                                # add no indentations
                                elif newline_flag == 1:
                                    new_line = " " * (line_indent) + temp_line.lstrip() + " ///\n"
                            # the last line
                            elif (i == len(line_split) - 1):
                                # if the previous line does not include a line break, then
                                # add an appropriate indentations
                                if newline_flag == 0:
                                    new_line = " " * (line_indent + int(indent)) + temp_line.lstrip()
                                # if the previous line does include a line break, then
                                # assuming that the indentation is correctly done,
                                # add no indentations
                                elif newline_flag == 1:
                                    new_line = " " * (line_indent) + temp_line.lstrip()
                                # if there is any comment in the original line, add it at the end
                                if len(line_split_for_comment) > 1:
                                    new_line = new_line + " //" + line_comment
                            output_list.append(new_line)
                # flag if the line includes a line break, which will be used
                # in the next line
                if "///" in line:
                    newline_flag = 1
                else:
                    newline_flag = 0
    with open(output_file, "w") as writer:
        for output_line in output_list:
            writer.write(output_line)

# Add a white space before a curly bracket
# (but not if the curly bracket is used for global macro, as in "${}") --------------------
def space_before_curly(input_file, output_file, indent, tab_space, linemax):
    output_list = []
    with open(input_file, "r") as reader:
        input_lines = reader.readlines()
        comment_delimiter = 0
        for line_index, line in enumerate(input_lines):
            # update comment_delimiter
            comment_delimiter = update_comment_delimiter(comment_delimiter, line)
            if comment_delimiter > 0:
                output_list.append(line)
            elif comment_delimiter == 0:
                # replace "{" with " {" if there is no whitespace
                # before an open curly bracket, but ignore if
                # "${" since this is for global macro
                output_list.append(re.sub(r"([^ $]){", r"\1 {", line))
    with open(output_file, "w") as writer:
        for output_line in output_list:
            writer.write(output_line)

# Remove blank lines before curly brackets are closed --------------------
def remove_blank_lines_before_curly_close(input_file, output_file, indent, tab_space, linemax):
    output_list = []
    with open(input_file, "r") as reader:
        input_lines = reader.readlines()
        comment_delimiter = 0
        for line_index, line in enumerate(input_lines):
            # update comment_delimiter
            comment_delimiter = update_comment_delimiter(comment_delimiter, line)
            if comment_delimiter > 0:
                output_list.append(line)
            elif comment_delimiter == 0:
                if len(line.strip()) == 0:
                    for i in range(line_index + 1, len(input_lines)):
                        if len(input_lines[i].strip()) == 0:
                            pass
                        elif len(input_lines[i].strip()) > 0:
                            line_rstrip = " " + re.sub(r"//.*", r"", input_lines[i]).rstrip()
                            if (line_rstrip[-1] == "}") & (not re.search(r"\$.*{", input_lines[i])):
                                break
                            else:
                                output_list.append(line)
                                break
                elif len(line.strip()) > 0:
                    output_list.append(line)
    with open(output_file, "w") as writer:
        for output_line in output_list:
            writer.write(output_line)


# Remove duplicated blank lines --------------------
def remove_duplicated_blank_lines(input_file, output_file, indent, tab_space, linemax):
    output_list = []
    with open(input_file, "r") as reader:
        input_lines = reader.readlines()
        comment_delimiter = 0
        for line_index, line in enumerate(input_lines):
            # update comment_delimiter
            comment_delimiter = update_comment_delimiter(comment_delimiter, line)
            if comment_delimiter > 0:
                output_list.append(line)
            elif comment_delimiter == 0:
                if sld.detect_duplicated_blank_line(line_index, line, input_lines):
                    pass
                else:
                    output_list.append(line)
    with open(output_file, "w") as writer:
        for i, output_line in enumerate(output_list):
            writer.write(output_line)