capture program drop nmf
*! version 1.00 16 Sep 2022
program define nmf, rclass
    version 17
 
    syntax varlist(numeric), k(integer) [ epoch(integer 200) initial(string) loss(string) stop(numlist max = 1) nograph ]
    
    // Written by Dr Jonathan Batty (J.Batty@leeds.ac.uk),
    // Leeds Institute for Data Analytics (LIDA),
    // University of Leeds.
    //
    // Aim:
    // The aim of NMF is to find W (u x k) and H (k x v) such that A ~ WH, where all 3 matrices contain only non-negative values
    // A good appoximation may be achieved with k << rank(A) NMF is a NP-hard problem; the best that can be done is to find local
    // minima from a set of initialisations.
    //
    // <Inputs>:
    //      varlist: variables that contain columns of data to factorise. All rows of these columns will enter into the factorisation.
    //      
    //      k(): the rank of the factorisation, whereby k must be less than the rank of the number of columns input in varlist.
    //
    //      epoch(): the number of iterations of NMF to repeat to achieve convergence. Dafault = 200.
    //
    //      initial(): initialisation options, specified using the init() parameter, include:
    //          randomu [*default] - random, uniform initialisation of matrix in range [1, 2] 
    //          randomn - random, normally-distributed initialisation of matrix with mean 2 and standard deviation 1
    //          nndsvd - Nonnegative Double Singular Value Decomposition [Boutsidis2007] - suited to sparse factors
    //          nndsvda - NNSVD with zero elements replaced with the input matrix average (not recommended for sparse data)
    //          nndsvdar - NNSVD with zero elements replaced with a random value (nor recommended for sparse data)
    //      Using the random option, multiple runs will be required to identify the factorization that achieves the lowest approximation error. Other options are deterministic and only need to be ran once.
    //      Note: The multiplicative update ('mu') solver cannot update zeros present in the initialization, and so leads to poorer results when used jointly with nndsvd. Co-ordinate descent mitigates this
    //
    //      loss(): divergence function options, specified using the loss() parameter, include:
    //          eu [*default] - Frobenius (Euclidean) norm (equivalent to MSE for matrices)
    //          is - Itakura-Saito divergence 
    //          kl - Generalized Kullback-Leibler divergence
    //
    //      stop(): early stopping options, specified using the stop() parameter, include:
    //          0 - early stopping off; NMF continues for numer of iterations specified in epoch
    //          float value, e.g. 1.0e-4 [*default] - if ((previous error - current error) / error at initiation) < stop tolerance) then further iterations are not performed
    //
    //      nograph suppresses production of the graph of loss divergence at each iteration
    //
    // <Outputs>
    //      W - factor (basis) matrix W
    //      H - factor (encoding or coefficient) matrix H 
    //

    // If a value for initialisation method is not passed, default option is is random initialisation 
    if "`initial'" == "" local initial "randomu"

    // If a value for updating method is not passed, default option is is multiplicative updating (mu) 
    if "`method'" == "" local method "mu"

    // If a value specifying the form of loss divergence normalisation is not passed, default option is is Frobenius normalisation (2) 
    if "`loss'" == "" local loss = "eu"

    // If a value specifying the stopping condition is not passed, default option is is 1.0e-4 (i.e. 0.01%)
    if "`stop'" == "" local stop = 1.0e-4

    // Run NMF
    mata: nmf("`varlist'", `k', `epoch', "`initial'", "`method'", "`loss'", `stop')

    // Store results of NMF using stata matrices
    matrix W = r(W)
    matrix H = r(H)
    matrix error = r(norms)

    // Creates frames containing output matrices: W, H and norms
    display as text "Creating frames W, H and error to hold results."

    // Create empty new frames to hold matrices
    foreach outputFrame in W H error {

        // If a frame with the given name exists - drops and recreates it
        capture confirm frame `outputFrame'
        if !_rc {
            frame drop `outputFrame'
            frame create `outputFrame'
        }
        else {
            frame create `outputFrame'
        }

        // Populate each frame with the returned matrix object
        frame `outputFrame' {
            quietly svmat `outputFrame'
        }
    }

    // Add iteration identifier to norms frame
    frame error {
        rename error1 epoch
        rename error2 total_loss
        rename error3 average_loss
        local ymax = average_loss[2]
    }

    // Plots the normalisation values calculated after each iteration
    if "`graph'" != "nograph" {

        if "`loss'" == "is" local lossString "Itakura-Saito Divergence" 
        if "`loss'" == "kl" local lossString "Generalized Kullback-Leibler Divergence"
        if "`loss'" == "eu" local lossString "Frobenius (Euclidean) Error "

        frame error: graph twoway line average_loss epoch if epoch > 0,                                                                                      ///
                                                      title("Loss Function")                                                                                 ///
                                                      xtitle("Epoch") xlabel(, labsize(*0.75) grid glcolor(gs15))                                            ///
                                                      ytitle("Mean `lossString'") yscale(range(0 .)) ylabel(#5, ang(h) labsize(*0.75) grid glcolor(gs15))    ///
                                                      graphregion(color(white))
    }

    display as text "Returning final matrices in r(W), r(H) and r(error)"

    // Return final results
    return matrix W W
    return matrix H H
    return matrix error error

end

version 17
mata:

void nmf(string scalar varlist, 
         real scalar k, 
         real scalar epoch, 
         string scalar initial,
         string scalar method,
         string scalar loss,
         real scalar stop)
{
    // Declare all variable types
    real matrix A, W, H
    real colvector norm, norms
    real scalar e, i, normResult, stopMeasure, hasMissing
    string scalar lossMethodString

    // Construct mata matrix from input varlist
    A = st_data(., varlist)
    
    // Perform error checking:
    // 1. Ensure that input matrix is non-negative (i.e. no negative values)
    if (min(A) < 0) {
        _error("The input matrix must not contain negative values.")
    }

    // 2. Ensure that there are no missing values in the input matrix
    if (hasmissing(A) == 1) {
        //_error("The input matrix must not contain missing values.")
        hasMissing = 1
    }
    else {
        hasMissing = 0
    }

    // 3. Ensure that specified rank, k,  is valid (i.e. 2 < k < rank(A))
    if (k < 2 | k >= cols(A) | k >= rows(A)) {
        _error("The rank of the output matrix must be at least 2, but less than the number of rows and columns of the input matrix, A.")
    }

    // Warn about use of nndsvd with mu
    if (initial == "nndsvd" & method == "mu") {
        printf("Warning: the matrices H and W initialized by NNDSVD may contain zeroes.\n") 
        printf("These will not be updated duing multiplicative updating.\n")
        printf("For better results, select a different initializstion or update method..\n")
    }

    // Initialisation of matrix
    if (initial == "randomu" | initial == "randomn") {
        randomInit(A, k, W, H, initial)
    }
    else if (initial == "nndsvd" | initial == "nndsvda" | initial == "nndsvdar") {
        nnsvdInit(A, k, W, H, initial)
    }
    else {
        printf("Error - an invalid initialisation type has been set.")
    }
  
    // Create a column matrix (rows = epochs) to store normalisation results from each iteration
    normResult = lossDivergence(A, W, H, loss, hasMissing)
    norms = J(1, 3, .)
    norms[1, 1] = 0
    norms[1, 2] = normResult
    norms[1, 3] = normResult / (length(A) - missing(A))

    // Updating matrices the given number of iterations
    displayas("text")
    printf("\nFactorizing matix...{space 29} Maximum number of epochs set at: %9.0f \n\n", epoch)

    printf("{hline 10}{c TT}{hline 26}{c TT}{hline 17}{c TT}{hline 16}{c TT}{hline 19}\n")
    printf("{txt}{space 4}Epoch {c |}{space 12}Loss Function {c |}{space 5}Total Error {c |}{space 5}Mean Error {c |}{space 5}Relative Error\n")
    printf("{hline 10}{c +}{hline 26}{c +}{hline 17}{c +}{hline 16}{c +}{hline 19}\n")
    //printf("{txt}%12s {c |} {res}%10.0g %10.0g\n", varname[i], coef[i], se[i])


    // Update W and H by chosen update method
    for (i = 1; i <= epoch; i++) {
        if (method == "mu") {

            if (loss == "eu") {
                if (hasMissing == 1) {
                    mu_eu_missing(A, W, H)
                }
                else {
                    mu_eu(A, W, H)
                }
            }
            else if (loss == "kl") {
                if (hasMissing == 1) {
                    mu_kl_missing(A, W, H)
                }
                else {
                    mu_kl(A, W, H)
                }
            }
            else if (loss == "is") {
                if (hasMissing == 1) {
                    _error("The chosen loss function is not supported when values are missing.")
                }
                else {
                    mu_is(A, W, H)
                }              
            }
        }
        else {
            _error("An invalid method has been chosen.")
        }
        
        // Calculate divergence (error) metric
        // This could be done every 5, 10 etc iterations ('trace' parameter)?
        normResult = lossDivergence(A, W, H, loss, hasMissing)

        // Update norms matrix with result of current iteration
        norm = J(1, 3, .)
        norm[1, 1] = i
        norm[1, 2] = normResult
        norm[1, 3] = normResult / (length(A) - missing(A))
        norms = norms \ norm
        
        // Print result of iteration to the screen
        if (loss == "is") lossMethodString = "Itakura-Saito"
        if (loss == "kl") lossMethodString = "Kullback-Leibler"
        if (loss == "eu") lossMethodString = "Frobenius (Euclidean)"

        if (mod(i, 10) == 0 | i == epoch) {
            // Calculate relative error
            relError = (norms[i - 1, 2] - norms[i, 2]) / norms[2, 2]

            // Print results to the screen
            printf("%9.0f {c |} %24s {c |}%16.2f {c |}%15.4f {c |}%19.6f\n", i, lossMethodString, normResult, norm[1, 3], relError)
        }
        
        // Implement stopping rule if one is set (i.e. stop > 0)
        // Checks every 10 iterations whether MU should contine or if it should stop.
        if (stop != 0 & mod(i, 10) == 0){
            // Calculate stopping measure
            // if ((previous error - current error) / error at initiation) < stop tolerance) then stop
            stopMeasure = (norms[i - 1, 2] - norms[i, 2]) / norms[2, 2]
            if (relError < stop)
            {
                printf("{hline 10}{c BT}{hline 26}{c BT}{hline 17}{c BT}{hline 16}{c BT}{hline 19}\n")
                printf("{result}\nStopping at epoch " + strofreal(i) + "...\n")
                printf("{result}Criteria for early stoping have been met. Error reduced to: " + strofreal(stopMeasure) + ", which < the stopping threshold (" + strofreal(stop) +").\n\n")
                break
            }
        }    
        
        // If reach end of set iterations and model has not converged: warn user
        if (i == epoch) {
            printf("{hline 10}{c BT}{hline 26}{c BT}{hline 17}{c BT}{hline 16}{c BT}{hline 19}\n")
            printf("{error}\nMatrix decomposition did not converge within the set number of epochs.\n") 
            printf("Please consider increasing the number of epochs used during decomposition.\n")
        }
    }

    printf("{text}\n")


    // Return results object to stata
    st_matrix("r(W)", W)
    st_matrix("r(H)", H)
    st_matrix("r(norms)", norms)
}


void randomInit(real matrix A, 
                real scalar k, 
                real matrix W, 
                real matrix H,
                string scalar initial)
{
    if (initial == "randomu") {
        // Generate random values for W in the range [min(A), max(A)]
        W = runiform(rows(A), k, min(A), max(A))

        // Generate random values for H in the range [min(A), max(A)]
        H = runiform(k, cols(A), min(A), max(A))

    } 
    else if (initial == "randomn") {
        // Generate random values for W in the range [1 - 2]
        W = rnormal(rows(A), k, 2, 1)

        // Generate random values for H in the range [1 - 2]
        H = rnormal(k, cols(A), 2, 1)
    }
}

void nnsvdInit(real matrix A, 
               real scalar k, 
               real matrix W, 
               real matrix H, 
               string scalar initial)
{
    // Declare all variable types used in this function
    real matrix U, S, V
    real colvector ui, vi, ui_pos, ui_neg, vi_pos, vi_neg, _ui, _vi
    real scalar i, ui_pos_norm, ui_neg_norm, vi_pos_norm, vi_neg_norm, norm_pos, norm_neg, sigma, averageOfInputMatrix

    // Perform SVD and transpose resulting S matrix
    svd(A, U, S, V)
    S = S'

    // Set up empty matrices of the correct size for W and H
    W = J(rows(A), k, 0)
    H = J(k, cols(A), 0)
    
    // Get first column of W values based on SVD results
    W[., 1] = sqrt(S[1, 1]) :* abs(U[., 1])

    // Get first row of H values based on SVD results
    H[1, .] = sqrt(S[1, 1]) :* abs(V[., 1]')

    for (i = 2; i <= k; i++) {

        ui = U[., i]
        vi = V[., i]

        // Divide into positive-only and negative-only matrices
        ui_pos = (ui :>= 0) :* ui
        ui_neg = (ui :< 0) :* -ui
        vi_pos = (vi :>= 0) :* vi
        vi_neg = (vi :< 0) :* -vi

        // Calculate 2-norm of each of the positive and negative columns
        ui_pos_norm = norm(ui_pos, 2)
        ui_neg_norm = norm(ui_neg, 2)
        vi_pos_norm = norm(vi_pos, 2)
        vi_neg_norm = norm(vi_neg, 2)

        // Multiply the positive and negative norms to get overall values
        norm_pos = ui_pos_norm * vi_pos_norm
        norm_neg = ui_neg_norm * vi_neg_norm

        // Check which is larger, norm_pos or norm_neg
        if (norm_pos >= norm_neg) {
            _ui = ui_pos :/ ui_pos_norm
            _vi = vi_pos :/ vi_pos_norm
            sigma = norm_pos
        }
        else {
            _ui = ui_neg :/ ui_neg_norm; 
            _vi = vi_neg :/ vi_neg_norm;
            sigma = norm_neg
        }
        
        // Update rows of W and H
        W[., i] = sqrt(S[1, i] * sigma) * _ui
        H[i, .] = sqrt(S[1, i] * sigma) * _vi'
    }

    // Replaces zero values in initialised matrices with average value of input matrix, A
    if (initial == "nndsvda") {
        
        // Calculate properties of input matrix
        averageOfInputMatrix = matrixMean(A)

        // Update zeros with average values of input matrix
        W = editvalue(W, 0, averageOfInputMatrix)
        H = editvalue(H, 0, averageOfInputMatrix)

    }
    // Replace zeros in initialised matrices with random value in the space [0 : average/100]
    else if (initial == "nndsvdar") {

        // Calculate properties of input matrix
        averageOfInputMatrix = matrixMean(A)

        // Update zeroes in W with averge vales, scaled
        W = editvalue(W, 0, averageOfInputMatrix * runiform(1, 1) / 100)
        H = editvalue(H, 0, averageOfInputMatrix * runiform(1, 1) / 100)
    }
    // Replace zeros in initialised matrices with the smallest possible positive value
    else {
        W = editvalue(W, 0, epsilon(1))
        H = editvalue(H, 0, epsilon(1))
        
    }
}

scalar lossDivergence(real matrix A,
                      real matrix W,
                      real matrix H,
                      string scalar loss,
                      real scalar hasMissing)
{
    // Declare all variable types
    real matrix div, M
    real scalar divergence

    if (hasMissing == 0){

        // Logic flow based on parameter passed to nmf()
        if (loss == "is") {
            // is = Itakura-Saito divergence (only if no zero/missing values)
            div = A :/ (W*H)
            divergence = sum(div) - (rows(A) * cols(A)) - sum(log(div))
        }
        else if (loss == "kl") {
            // kl = Generalized Kullback-Leibler divergence
            divergence = sum(A :* log(A :/ ((W*H)) :+ epsilon(1)) :- A :+ (W*H))
        }
        else if (loss == "eu") {
            // eu = Frobenius (or Euclidean) norm
            divergence = sum((A - W*H) :^ 2)
        }
        else {
            _error("Invalid value of loss function supplied.")
        }
    }

    else if (hasMissing == 1) {
        if (loss == "is") {
            // is = Itakura-Saito divergence (only if no zero/missing values)
            _error("This distance metric is not supported for missing data.")
        }
        else if (loss == "kl") {
            // kl = Generalized Kullback-Leibler divergence
            M = (A :!= .)
            divergence = sum((M :* A) :* log((M :*A) :/ (M :* (W*H)) :+ epsilon(1)) :- (M :* A) :+ (M :* (W*H)))
        }
        else if (loss == "eu") {
            // eu = Frobenius (or Euclidean) norm
            M = (A :!= .)
            divergence = sqrt(sum(( M :* (A - W*H)) :^ 2))
        }
        else {
            _error("This distance metric is not supported for missing data.")
        }

    }

    return(divergence)
}

void mu_eu(real matrix A,
           real matrix W, 
           real matrix H)
{
    // References: 
    // [1]    Lee, D. and Seung, H. Algorithms for Non-negative Matrix Factorization.
    //        Advances in Neural Information Processing Systems 13: Proceedings of the 2000 Conference,
    //        MIT Press. pp. 556â€“562, 2000.
    // [2]    Lee, D. and Seung, H. Learning the parts of objects by non-negative matrix factorization. 
    //        Nature 401, pp. 788â€“791 (1999).

    // Update H
    H = H :* ((W' * A) :/ (W' * W * H))

    // Update W
    W = W :* ((A * H') :/ (W * H * H'))
}

void mu_eu_missing(real matrix A,
                   real matrix W, 
                   real matrix H)
{
    real matrix M
    
    M = (A :!= .)
    _editmissing(A, 0)

    // Update H
    H = H :* ((W' * (M :* A)) :/ (W' * (M :* (W * H))))

    // Update W
    W = W :* (((M :* A) * H') :/ ((M :* (W * H)) * H'))
}


void mu_kl(real matrix A,
           real matrix W, 
           real matrix H)
{
    // Update H
    H = H :* ((W' * (A :/ (W*H))) :/ (W' * J(rows(W), cols(H), 1)))
    
    // Update W
    W = W :* (((A :/ (W*H)) * H') :/ (J(rows(W), cols(H) , 1) * H'))
    
}

void mu_kl_missing(real matrix A,
                   real matrix W, 
                   real matrix H)
{
    real matrix M
    
    M = (A :!= .)
    _editmissing(A, 0)

    // Update H
    W = (W :/ (M * H')) :* (((M :* A) :/ (W * H)) * H')

    // Update W
    H = (H :/ (W' * M)) :* (W' * ((M :* A) :/ (W * H)))
}

void mu_is(real matrix A,
           real matrix W, 
           real matrix H)
{
    real matrix WH

    // Update H
    WH = W*H
    H = H :* ((W' * (A :/ ((WH :^ 2) :+ epsilon(1)))) :/ (W' * ((WH :+ epsilon(1)) :^ -1)))
    
    // Update W
    WH = W*H
    W = W :* (((A :/ ((WH :^ 2) :+ epsilon(1))) * H') :/ (((WH :+ epsilon(1)) :^ -1) * H'))
}

real scalar matrixMean(real matrix A)
{
    // Returns mean of matrix (assumes no misisng values)
    return(sum(A) / length(A))
}

end