*! version 1.1.2 20feb2024 I I Bolotov program define tmpinv, eclass byable(recall) version 16.0 /* The program implements a non-iterated Transaction Matrix (TM)-specific LPLS estimator for linear programming with the help of the Moore-Penrose inverse (pseudoinverse), calculated using singular value decomposition (SVD). Estimation using 2x2 to 50x50 contiguous submatrices, repeated with compensatory slack variables until NRMSE is minimized in a given number of iterations, is followed by an F-test from linear regression/ t-test of mean NRMSE from a pre-simulated distribution (Monte-Carlo, 50,000 iterations with matrices consisting of normal random variates). The result is adjusted for extreme values to match the RHS via shares of estimated row/column sums if the corresponding option is specified. Author: Ilya Bolotov, MBA, Ph.D. Date: 30 September 2022 */ tempname t title rspec cspec // syntax syntax /// anything [if] [in] [, /// Values(string) Slackvars(string) /// ZERODiagonal ADJustment(string) /// SUBMatrix(int 2) TOLerance(real 0) Level(cilevel) /// DISTribution TRACE ITERate(int 1) /// ] // adjust and preprocess options if `"`values'"' != "" & "`zerodiagonal'" != "" { // either cOLS or TM option di as err "options values and zerodiagonal are mutually incompatible" exit 198 } loc submatrix = cond(`submatrix' >= 2, `submatrix', 2)/* minimum is 2 */ loc submatrix = cond(`submatrix' <= 50, `submatrix', 50)/* maximum is 50 */ loc adjustment = cond(ustrregexm("`adjustment'", "^\s*ave" ), 1.5, 0) + /// cond(ustrregexm("`adjustment'", "^\s*row" ), 1, 0) + /// cond(ustrregexm("`adjustment'", "^\s*col" ), 2, 0) /// // select a subset of the data (if applicable) if `"`if'`in'"' != "" { preserve qui keep `if' `in' } // perform the pinv estimation via SVD sca `t' = clock(c(current_date) + " " + c(current_time), "DMYhms") mata: tmpinv( /// `"`anything'"', `"`values'"', `"`slackvars'"', /// ("`zerodiagonal'" != "" ? 1 : 0), ("`trace'" != "" ? 1 : 0), /// `submatrix', `iterate', `adjustment', /// (`tolerance' ? `tolerance' : .), `level' /// ) sca `t' = clock(c(current_date) + " " + c(current_time), "DMYhms") - `t' // print output di as txt _n "Converged in `=round(`t'/1000, 0.001)'s, solution stored" /// _c " in{stata ret li: r(solution)}" _n cap confirm mat r(tests) if ! _rc { loc `title' = cond(`submatrix' <= 2, "F-test, linear regression:", /// "t-tests of mean NRMSE in submatrices, MC sample (50,000):") loc `rspec' = "& - `= "& " * rowsof(r(tests))'" loc `cspec' = "& %2.0f & %2.0f | %9.0g | " + cond(`submatrix' <= 2, /// "%12.8f | %8.4f & %10.2g & %5.0f & %5.0f & %5.4f &", /// "%12.8f | %8.4f & %8.0g & %5.4f & %5.4f & %5.4f &") di as res _n "``title''" matlist r(tests), names(col) rspec(``rspec'') cspec(``cspec'') } cap confirm mat r(nrmse_dist) if ! _rc & "`distribution'" != "" { di as res _n "percentiles, MC sample (50,000):" loc `rspec' = "& - `= "& " * rowsof(r(nrmse_dist))'" loc `cspec' = "`= "& %6.2f " * 10'&" matlist r(nrmse_dist), names(col) rspec(``rspec'') cspec(``cspec'') } end version 16.0 loc RS real scalar loc RV real colvector loc RM real matrix loc SS string scalar loc TM transmorphic matrix loc PF pointer(`RM' function) scalar loc VV void loc version "version 16:" mata: mata set matastrict on `VV' tmpinv(`SS' rhs, `SS' rhs_v, `SS' lhs_s, |`RS' f_d, `RS' f_t, `RS' lim, `RS' iter, `RS' adj, `RS' tol, `RS' lvl) /* solver */ { `RS' r, c, i, j, mc_N, r2_c, r2_v `RV' v, e `RM' RHS, RHS_V, LHS_S, D, C, M, S, d, a, b, X, T `TM' tmp, f // general configuration tmp = _stata("`version' tsset, noq", 1) RHS = (! _stata("`version' confirm numeric var " + /* RHS */ regexr(rhs, ".+\.", ""), 1) ? st_data(., (! tmp ? _st_tsrevar(tokens(rhs)) : tokens(rhs))) : (length(tokens(rhs)) == 1 ? st_matrix(rhs) : J(0,0,.)) ) RHS_V = (! _stata("`version' confirm numeric var " + regexr(rhs_v, ".+\.", ""), 1) ? st_data(., (! tmp ? _st_tsrevar(tokens(rhs_v)) : tokens(rhs_v))) : (length(tokens(rhs_v)) == 1 ? st_matrix(rhs_v) : J(0,1,.)) ) LHS_S = (! _stata("`version' confirm numeric var " + /* LHS */ regexr(lhs_s, ".+\.", ""), 1) ? st_data(., (! tmp ? _st_tsrevar(tokens(lhs_s)) : tokens(lhs_s))) : (length(tokens(lhs_s)) == 1 ? st_matrix(lhs_s) : J(0,0,.)) ) f_d = f_d != . ? f_d : 0 /* flag: zero-diagonal */ f_t = f_t != . ? f_t : 0 /* flag: trace */ lim = lim != . ? lim : 50 /* max submatrix size */ iter = iter != . ? iter : 1 /* iterations */ adj = adj != . ? adj : 0 /* flag: adjustment */ tol = tol != . ? tol : . /* tolerance */ lvl = lvl != . ? lvl : c("level") /* confidence level */ mc_N = 50000 /* Monte-Carlo sample */ /* check the dimensions of RHS, V, S (the elements of `a`), and `b` */ if (! rows(rhs) | (rows(RHS) & cols(RHS) != 2) | (rows(RHS_V) & cols(RHS_V) != 1) | (rows(LHS_S) & cols(LHS_S) != 2)) { errprintf("Error: TM requires two columns in `b` and `S`, one in `V`\n") exit(503) } if (strtrim(rhs) != "" & ! rows(RHS) ) { errprintf("Error: variabe/matrix in RHS not found/ambiguous\n") exit(111) } if (strtrim(rhs_v) != "" & ! rows(RHS_V)) { errprintf("Error: variabe/matrix in values() not found/ambiguous\n") exit(111) } if (strtrim(lhs_s) != "" & ! rows(LHS_S)) { errprintf("Error: variabe/matrix in slackvars() not found/ambiguous\n") exit(111) } if (rows(RHS_V) & (length(RHS_V) != sum(rowmissing(RHS[,1]) :== 0) * sum(rowmissing(RHS[,2]) :== 0)) | rows(LHS_S) & (length(LHS_S) != length(RHS))) { errprintf("Error: TM requires the same number of rows in LHS and RHS\n") exit(503) } // estimate using limxlim contiguous submatrices and t-tests of mean NRMSE X = J(sum(rowmissing(RHS[,1]) :== 0),sum(rowmissing(RHS[,2]) :== 0),.) T = J(0,18,.) st_framecreate(f=st_tempname()) /* create a hidden frame */ (void) _stata("`version' findfile tmpinv.mmat",1)/* load MC distribution */ D = fgetmatrix((tmp=fopen(st_global("r(fn)"), "r"))) fclose(tmp) if (rows(RHS) > lim) lim-- /* choose matrix size */ for(i = 0; i < ceil(sum(rowmissing(RHS[,1]) :== 0) / lim); i++) { for(j = 0; j < ceil(sum(rowmissing(RHS[,2]) :== 0) / lim); j++) { /* dimensions of a contiguous submatrix/full problem */ d = (lim*i+1),(lim*j+1)\ min((lim*i+lim, sum(rowmissing(RHS[,1]) :== 0))), min((lim*j+lim, sum(rowmissing(RHS[,2]) :== 0))) r = d[2,1] - d[1,1] + (rows(RHS) > lim ? /*subM*/ 2 : /*full*/ 1) c = d[2,2] - d[1,2] + (rows(RHS) > lim ? /*subM*/ 2 : /*full*/ 1) /* selection from known values (if applicable) */ if (rows(RHS_V)) v = colshape((colshape((RHS_V\J(rows(X)*cols(X)- rows(RHS_V),1,.))[ colshape(colshape(1..length(X),cols(X))[|d|],1) ],c-(tmp=rows(RHS) > lim ? 1 : 0)), J(r-tmp,tmp,.)\J(tmp,c,.)), 1) /* M, C, S -> `a` */ C = I(r)#J(1,c,1)\J(1,r,1)#I(c) S = (rows(LHS_S) ? LHS_S[1..r,1]\ LHS_S[1..c,2] : J(r+c,0,.)) M = (f_d ? (rows(RHS) > lim ? /*subM*/((tmp=(I(min((r,c)))#(1,J(1,c,0)))[,(1..min((r,c))*c)]), J(rows(tmp),r*c-cols(tmp),0))[|(1\min((r,c))-1),(.\.)|] : /*full*/((tmp=(I(min((r,c)))#(1,J(1,c,0)))[,(1..min((r,c))*c)]), J(rows(tmp),r*c-cols(tmp),0))) : (rows(RHS) > lim ? /*subM*/(rows(RHS_V) ? I(r*c)[|(1\rows(v)),(.\.)|] : J(0,r*c,.)) : /*full*/(rows(RHS_V) ? I(r*c) : J(0,r*c,.)))) a = C,(rows(S) ? S : J(rows(C),0,.))\ (rows(M) ? M,J(rows(M),cols(S),0) : J(0,cols(C)+cols(S),.)) C = S = M = . /* clear memory */ /*`b` -> (, 1) */ if (rows(RHS) > lim) { /*subM*/(tmp=RHS)[|d[,1],(1\1)|] = J(r-1,1,0) (tmp )[|d[,2],(2\2)|] = J(c-1,1,0) b = RHS[|d[,1],(1\1)|]\colsum(tmp[,1])\ RHS[|d[,2],(2\2)|]\colsum(tmp[,2])\ (f_d ? J(min((r,c))-1,1,0) : (rows(RHS_V) ? v : J(0,1,.))) } else /*full*/ b = RHS[1..r,1]\ RHS[1..c,2]\ (f_d ? J(r,1,0) : RHS_V) /* missing values in `a` and `b` */ a = select(a, (tmp=rowmissing(a)+rowmissing(b)) :== 0) b = select(b, (tmp) :== 0) /* SVD-based solution, repeated until NRMSE is minimized */ e = sqrt(cross( (tmp=(b-a*(S=svsolve(a,b,.,tol)))[1..r+c]),tmp) / rows(b) / variance(b)) if(lim > 2 & rows(RHS_V)) { /* if `V` are defined */ S = (v\J(cols(LHS_S)!=0,1,0)),S e = .\e while((iter=iter == 1 ? c("maxiter") : iter) & e[rows(e)] >= e[2] & rows(e) < iter + 2) { e = e\sqrt(cross((tmp=(b-a*(S=S,svsolve( (a,(J(r+c,1,0)\select( (tmp=S[,max((1,cols(S)-1))]-S[,cols(S)])[1..r*c], rowmissing(v) :== 0))), b,., tol )[1..cols(a)])[,cols(S)])[1..r+c]),tmp) / rows(b) / variance(b)) } } e = (iter > 1 ? iter : rows(e)-(cols(S) > 1)*2), e[(tmp=selectindex(e :== min(e))[1])] X[|d|] = colshape((S=S[,tmp])[1..r*c],c)[|(1\d[2,1]-d[1,1]+1), (1\d[2,2]-d[1,2]+1)|] /* determined and overdetermined TM: F-test from Stata's regress */ tmp = (tmp=select(D,(D[,1] :== r :& D[,2] :== c :& D[,3] :== f_d)))\ J((tmp == J(0,cols(D),.)),cols(D),.) if (lim <= 2 & cols(a) <= rows(b)){ st_matrix("r(a)", a, "hidden") st_matrix("r(b)", b, "hidden") (void) _stata("`version' frame "+f+": svmat r(b), n(b)", 1) & _stata("`version' frame "+f+": svmat r(a), n(a)", 1) & _stata("`version' frame "+f+": reg b1 a*, noc " + "l("+strofreal(lvl)+")", ! f_t) tmp = .,e[2],tmp[6..14],st_numscalar("e(r2)"), st_numscalar("e(F)"),st_numscalar("e(df_m)"), st_numscalar("e(df_r)"), Ftail(st_numscalar("e(df_m)"), st_numscalar("e(df_r)"),st_numscalar("e(F)")) /* underdetermined TM: t-test of mean NRMSE from MC distribution */ } else if (lim > 2 & (e[2]+tmp[3]) != .) { (void) _stata("`version' ttesti "+strofreal(mc_N) + " "+strofreal(tmp[4]) + " "+strofreal(tmp[5]) + " "+strofreal(e[2]) + ", l("+strofreal(lvl)+")", ! f_t) tmp = e,tmp[6..14],st_numscalar("r(t)"),st_numscalar("r(df_t)"), st_numscalar("r(p_l)"),st_numscalar("r(p)"), st_numscalar("r(p_u)") } else /* problem, NRMSE == . */ tmp = e,J(1,14,.) a = b = . /* clear memory */ T = T\(r,c,tmp) } } // adjust the solution `X` to match RHS via shares of `X`s row- or colsums if (adj) X = ((adj <= 1.5 ? editmissing(X:/quadrowsum(X) :* select(RHS[,1], /*rows*/ rowmissing(RHS[,1]) :== 0), 0) : J(rows(X),cols(X),0)) + (adj >= 1.5 ? editmissing(X:/quadcolsum(X) :* select(RHS[,2], /*cols*/ rowmissing(RHS[,2]) :== 0)',0) : J(rows(X),cols(X),0))) / (adj != 1.5 ? 1 : 2) // calculate the R-squared and R-squared for CONSTRAINTS r = sum(rowmissing(RHS[,1]) :== 0) c = sum(rowmissing(RHS[,2]) :== 0) C = I(r)#J(1,c,1)\J(1,r,1)#I(c) b = select(RHS[,1]\RHS[,2], rowmissing(RHS[,1]\RHS[,2]) :== 0) r2_c = (1 - cross((tmp=b - C * colshape(X, 1)), tmp) / cross((tmp=b :- mean(b) ), tmp)) if (rows(RHS_V)) r2_v = (1 - cross((tmp=RHS_V - colshape(X, 1)), tmp) / cross((tmp=RHS_V :- mean(RHS_V) ), tmp)) // return the solution `X`, tests `T`, and MC distribution percentiles `D` st_eclear() st_rclear() st_matrix("r(solution)", X) /* solution */ st_matrix("r(tests)", (tmp=T[,1..4],T[,14..18])) st_matrixrowstripe("r(tests)", /* tests */ (J(rows(tmp),1,""),strofreal(1::rows(tmp)))) st_matrixcolstripe("r(tests)",(lim <= 2 ? (J(9,1,""),("r","c","iter","NRMSE","r2","F","df_m","df_r","p")') : (J(9,1,""),("r","c","iter","NRMSE","t","df_t","p_l","p","p_u")'))) st_matrix("r(nrmse_dist)", (tmp=T[,4..13]), "hidden") st_matrixrowstripe("r(nrmse_dist)", /* MC distribution */ (J(rows(tmp),1,""),strofreal(1::rows(tmp)))) st_matrixcolstripe("r(nrmse_dist)", (J(10,1,""),("NRMSE","p1","p5","p10","p25","p50","p75","p90","p95","p99")')) st_numscalar("r(r2_v)", r2_v >= 0 & r2_v <= 1 ? r2_v : .) st_numscalar("r(r2_c)", r2_c >= 0 & r2_c <= 1 ? r2_c : .) } end