* ===========================================================================
* Mata Code: Poisson Pseudo-Maximum Likelihood through IRLS
* ===========================================================================

// Include reghdfe (which in turn includes ftools) --------------------------
    cap findfile "reghdfe5.mata"
    if (_rc) {
        di as error "ppmlhdfe requires the {bf:reghdfe} package (version 6 or newer), which is not installed"
        di as error `" - install from {stata ssc install reghdfe:SSC}"'
        di as error `" - install from {stata `"net install reghdfe, from("https://github.com/sergiocorreia/reghdfe/raw/master/src/")"':Github}"'
        exit 9
    }
    include "`r(fn)'"

// Include .mata files -------------------------------------------------------
    local GLM "class GLM scalar"
    findfile "ppmlhdfe_functions.mata"
    include "`r(fn)'"
    findfile "ppmlhdfe_separation_simplex.mata"
    include "`r(fn)'"
    findfile "ppmlhdfe_separation_relu.mata"
    include "`r(fn)'"

// --------------------------------------------------------------------------
// GLM Class
// --------------------------------------------------------------------------

mata:
mata set matastrict on
mata set mataoptimize on
mata set matadebug off // (on when debugging; off in production)
mata set matalnum off  // (on when debugging; off in production)

class GLM
{
    `Varname'           depvar, touse, weight_var, offsetvar
    `Varlist'           indepvars // , fullindepvars
    // `RowVector'      not_basevar (HDFE)
    `String'            absorb, weight_type
    `RowString'         separation
    `FixedEffects'      HDFE
    `Boolean'           verbose
    `Boolean'           log
    `Integer'           init_step // Used to ensure that we run the init_*() functions in the correct order

    `String'            vcetype
    `RowString'         clustervars, base_clustervars

    // Data-related variables
    `Variable'          y, offset, true_w
    `Variables'         x
    `Integer'           k
    `RowVector'         stdev_x
    `Real'              stdev_y, min_positive_y

    // Advanced estimation/solver parameters
    `String'            initial_guess_method // Method used to determine initial values of -mu-
    `Boolean'           standardize_data
    `Boolean'           remove_collinear_variables
    `Boolean'           use_exact_solver
    `Boolean'           use_exact_partial
    `Boolean'           use_heuristic_tol
    `Real'              tolerance, start_inner_tol, target_inner_tol, realized_tolerance
    `Integer'           iter, subiter // Iteration and subiteration counts
    `Integer'           min_ok  // Minimum number of "ok" iterations before declaring convergence
    `Integer'           maxiter // Maximum number of iterations in IRLS step

    // Step-halving
    `Boolean'           use_step_halving
    `Real'              step_halving_memory
    `Integer'           max_step_halving

    // Separation-related parameters
    `Integer'           num_separated
    `Real'              mu_tol
    `Real'              simplex_tol
    `Integer'           simplex_maxiter
    `Real'              relu_tol
    `Real'              relu_maxiter
    `Boolean'           relu_report_r2
    `String'            relu_sepvarname
    `String'            relu_zvarname
    `Boolean'           relu_debug
    `Boolean'           relu_strict
    `Boolean'           relu_accelerate

    // Methods
    `Void'              new()
    `Void'              validate_parameters()
    `Void'              init_fixed_effects()
    `Void'              init_variables()
    `Void'              init_separation()
    `Void'              solve()
    `Boolean'           inner_irls() // Returns 1 if converged else 0
}
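// Hypothetical driver sketch (the actual call sequence lives in ppmlhdfe.ado);
// it mirrors the assert(++init_step == ...) checks enforced by each method:
//
//     class GLM scalar glm
//     glm = GLM()                   // new() fills in the defaults below
//     glm.depvar = "y"
//     glm.indepvars = "x1 x2"
//     glm.absorb = "id year"
//     glm.touse = "touse"
//     glm.validate_parameters()
//     glm.init_fixed_effects()      // init_step: 0 -> 1
//     glm.init_variables()          // init_step: 1 -> 2
//     glm.init_separation()         // init_step: 2 -> 3
//     glm.solve("b", "V", ...)      // init_step: 3 -> 4; posts b, V, N, ll, etc.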
`Void' GLM::new()
{
    verbose = 0
    log = 1
    init_step = 0
    weight_var = weight_type = ""
    iter = subiter = 0
    // separation = J(1, 0, "")

    // Advanced estimation/solver parameters
    remove_collinear_variables = 1
    standardize_data = 1
    use_exact_solver = 0
    use_exact_partial = 0
    use_heuristic_tol = 1
    use_step_halving = 0
    step_halving_memory = 0.9
    max_step_halving = 2
    tolerance = 1e-8
    target_inner_tol = 1e-9 // Target HDFE tolerance
    start_inner_tol = 1e-4
    initial_guess_method = "simple"
    min_ok = 1 // Set to 1 or at most 2... // BUGBUG: If this is below 2, then this test will fail: savefe_advanced.do (!!)
    maxiter = 1000 // 10,000 ?

    // Separation parameters
    num_separated = 0
    separation = tokens("fe simplex relu mu")
    mu_tol = 1e-6 // Actual tolerance is this scaled by the minimum MU when y>0
    simplex_tol = 1e-12
    simplex_maxiter = 1000
    relu_tol = 1e-4
    relu_maxiter = 100
    relu_report_r2 = relu_debug = 0
    relu_zvarname = relu_sepvarname = ""
    relu_strict = 0
    relu_accelerate = 0 // 0 by default as it conflicts with the "accelerate partial" trick (by making the weights change frequently). Maybe if we make the weights more stable across iters...?
}


// --------------------------------------------------------------------------
// Validate that parameter values are not invalid
// --------------------------------------------------------------------------

`Void' GLM::validate_parameters()
{
    assert_boolean(remove_collinear_variables)
    assert_boolean(standardize_data)
    assert_boolean(use_exact_solver)
    assert_boolean(use_exact_partial)
    assert_boolean(use_heuristic_tol)
    assert_msg(0 < tolerance & tolerance <= 1, "tolerance must be a real between 0 and 1", 9001, 0)
    assert_msg(0 < start_inner_tol & start_inner_tol <= 1, "start_inner_tol must be a real between 0 and 1", 9001, 0)
    assert_in(initial_guess_method, ("simple", "ols"))
    assert_msg(min_ok >= 1, "min_ok must be a positive integer", 9001, 0)
    assert_msg(0 < maxiter)

    // Separation parameters
    assert_msg(0 < mu_tol & mu_tol < 1e-1, "mu_tol must be a real in (0, 0.1)", 9001, 0)
    assert_msg(0 < simplex_tol & simplex_tol < 1e-1, "simplex_tol must be a real in (0, 0.1)", 9001, 0)
    assert_msg(0 < simplex_maxiter, "simplex_maxiter must be a positive integer", 9001, 0)
}


// --------------------------------------------------------------------------
// Compute HDFE object
// --------------------------------------------------------------------------

`Void' GLM::init_fixed_effects()
{
    `Integer'               hdfe_verbose, num_tokens, i
    `Boolean'               check_separation
    `String'                options, key, val, cmd
    `StringRowVector'       tokens

    if (verbose > 0) printf("\n{txt}{bf:- Parsing absorb() and creating HDFE object:}\n")
    assert(++init_step == 1)
    assert_msg(depvar != "", "glm.depvar is empty")
    assert_msg(touse != "", "glm.touse is empty")

    // SYNTAX: fixed_effects(absvars [, touse, weighttype, weightvar, drop_singletons, verbose])
    // Note: to set drop_singletons=0, add the "keepsingletons" option to the -absorb- string
    hdfe_verbose = verbose > 0 ? verbose - 1 : verbose
    check_separation = anyof(separation, "fe")
    HDFE = fixed_effects(absorb, touse, check_separation ? "iweight" : "", depvar, ., hdfe_verbose)
    HDFE.depvar = depvar // Otherwise, e(depvar) will be set to missing when we call HDFE.post()

    // Check for invalid suboptions within absorb()
    if (verbose > -1 & !HDFE.drop_singletons) printf("{err}warning: keeping singleton groups will keep fixed effects that cause separation\n")
    assert_msg(HDFE.residuals == "", "option {bf:residuals} not allowed", 198, 0)
    //assert_msg(st_global("s(options)") == "", sprintf("option(s) {bf:%s} not allowed", st_global("s(options)")), 198, 0)

    // Add advanced options
    options = st_global("s(options)")
    if (options != "") {
        tokens = tokens(options, " ()")
        assert_msg(!mod(cols(tokens), 4), sprintf("Invalid options: %s", options))
        num_tokens = trunc(cols(tokens) / 4)
        for (i=0; i<num_tokens; i++) {
            // Each suboption arrives as a "key ( val )" quadruple of tokens
            key = tokens[4*i+1]
            val = tokens[4*i+3]
            cmd = sprintf("%s(%s)", key, val)
            if (verbose > 0) printf("{txt}   - advanced absorb() suboption: {res}%s{txt}\n", cmd)
            if (key == "tolerance") HDFE.tolerance = strtoreal(val)
            else if (key == "maxiter") HDFE.maxiter = strtoreal(val)
            else if (key == "acceleration") HDFE.acceleration = val
            else if (key == "transform") HDFE.transform = val
            else _error(sprintf("invalid absorb() suboption: %s", cmd))
        }
    }
}


// --------------------------------------------------------------------------
// Load variables
// --------------------------------------------------------------------------

`Void' GLM::init_variables()
{
    if (verbose > 0) printf("\n{txt}{bf:- Loading regression variables into Mata}\n")
    assert(++init_step == 2)

    // 1) Expand factor variables in the RHS, and mark omitted variables
    stata(sprintf("ms_expand_varlist %s if %s", indepvars, touse))
    if (verbose > 0) stata("return list")
    HDFE.not_basevar = strtoreal(tokens(st_global("r(not_omitted)"))) // We want to output all coefs including the omitted ones
    indepvars = HDFE.indepvars = tokens(st_global("r(varlist)"))
    HDFE.fullindepvars = st_global("r(fullvarlist)") // tokens() ???
    HDFE.varlist = depvar, HDFE.indepvars

    // 2) Load LHS
    y = st_data(HDFE.sample, depvar)
    assert_msg(all(y :>= 0), sprintf("{err}%s must be greater than or equal to zero", depvar), 459, 0)

    // 3) Load RHS
    k = cols(indepvars)
    if (k) {
        _st_data_wrapper(HDFE.sample, indepvars, x=., verbose)
    }
    else {
        x = J(rows(y), 0, .)
    }
    assert(cols(x)==k)

    // 4) Load additional variables
    offset = offsetvar != "" ? st_data(HDFE.sample, offsetvar) : J(0, 1, .)
    true_w = weight_var != "" ? st_data(HDFE.sample, weight_var) : 1

    // 5) Save memory
    // st_dropvar(HDFE.tousevar) // (If needed, preserve+clear the data)

    // 6) Standardize data
    if (standardize_data) {
        if (verbose > 0) printf("{txt} @@ Standardizing variables\n")
        stdev_x = reghdfe_standardize(x)
        stdev_y = reghdfe_standardize(y)
        //_edittozerotol(y, epsilon(1)) // round to zero values below macheps (2e-16)
        // Warning: doing this might be a bad idea, and will fail the collinear.do test
        min_positive_y = min(select(y, y:>0))
        if (verbose > -1 & min_positive_y <= 1e-6) printf("{err}warning: dependent variable takes very low values after standardizing (%g)\n", min_positive_y)
        // TODO: do we need to rescale the offset?
    }
    else {
        stdev_x = J(1, k, 1)
        stdev_y = 1
    }

    // 7) Speedup trick: Try to sort data by first FE (if not already sorted by one of the FEs)
    // Isn't it better to do this BEFORE loading all the data and creating HDFE?

    // 8) Remove collinear variables (better to do this now than to carry these variables through the separation step)
    // Note that this requires an entire partial_out() call, so it is slow
    if (remove_collinear_variables) {
        if (verbose > 0) printf("{txt} @@ Removing collinear variables\n")
        HDFE.varlist = HDFE.indepvars
        remove_collinears(HDFE, target_inner_tol, x, k, stdev_x, weight_type, weight_var, true_w, verbose) // Will modify (HDFE.not_basevar, x, k, stdev_x) accordingly, and overwrite HDFE.weights
        HDFE.varlist = depvar, HDFE.indepvars
    }
}
// --------------------------------------------------------------------------
// Detect and correct separation
// --------------------------------------------------------------------------

`Void' GLM::init_separation()
{
    `Boolean'               check_separation
    `Vector'                non_separated_obs
    `Integer'               num_drop

    assert(++init_step == 3)

    // Abort if there are no boundary observations (i.e. y > 0 always)
    if (all(y :> 0)) {
        if (verbose > 0) printf("{txt}\n $$ No boundary observations (y=0), no separation tests required.\n")
        return
    }

    // Simplex method
    check_separation = anyof(separation, "simplex") & k
    if (check_separation) {
        num_drop = simplex_fix_separation(HDFE, y, x, k, stdev_x, true_w, weight_type, weight_var, target_inner_tol, simplex_tol, simplex_maxiter, non_separated_obs=., verbose)
        if (num_drop & rows(offset)) offset = offset[non_separated_obs]
        num_separated = num_separated + num_drop
    }

    // ReLU method (also works for fixed effects and combinations of regressors and FEs)
    check_separation = anyof(separation, "relu")
    if (check_separation) {
        num_drop = relu_fix_separation(HDFE, y, x, k, stdev_x, true_w, weight_type, weight_var, target_inner_tol, relu_tol, relu_maxiter, relu_sepvarname, relu_zvarname, relu_debug, relu_report_r2, non_separated_obs=., relu_strict, relu_accelerate, verbose)
        if (num_drop & rows(offset)) offset = offset[non_separated_obs]
        num_separated = num_separated + num_drop
    }
}


// --------------------------------------------------------------------------
// Compute estimates through IRLS
// --------------------------------------------------------------------------

`Void' GLM::solve(`String' bname, `String' Vname, `String' nname, `String' rname, `String' dfrname, `String' llname, `String' ll_0name, `String' devname, `String' chi2name, `String' d_name)
{
    `Variable'              mu, eta, z, resid, d
    `Variables'             data
    `Vector'                b
    `Matrix'                V
    `Integer'               N, df_r, rank, N_sep, backup_iter
    `Boolean'               check_separation
    `Boolean'               converged
    `Real'                  deviance, eps, ll, ll_0, ll_0_mu, chi2
    `Vector'                separated_obs, non_separated_obs, zero_sample

    assert(++init_step == 4)
    if (verbose > 1) printf("{txt} @@ Starting GLM::solve\n")

    // Set up tolerance (used to estimate initial values and first step of IRLS)
    HDFE.tolerance = max(( start_inner_tol , tolerance ))

    // Set weights (used when setting initial values...)
    HDFE.load_weights(weight_type, weight_var, true_w, verbose) // Before, HDFE.weight was just -depvar-!

    // Initial values for -mu- (using actual weights)
    // - Reference: Generalized Linear Models and Extensions (Hardin & Hilbe) page 31
    //   a) "initialize the fitted values to the inverse link of the mean of the response variable"
    //   b) "set the initial fitted values to (y + y̅) / 2"
    // TODO: is there a better way?
    if (verbose > 0) printf("{txt} @@ Setting initial values\n")
    if (initial_guess_method == "ols") {
        if (verbose > 0) printf("\n{txt} - OLS Estimates of log(1+y) as initial values")
        HDFE._partial_out(data = (log(y :+ mean(y, HDFE.weight) :/ 100), x), ., 0, ., 1) // Don't standardize vars; flush aux vectors
        // Bugbug why divide mean by 100? to make it small?
        reghdfe_solve_ols(HDFE, data, b=., V=., N=., rank=., df_r=., mu=., ., "vce_none") // mu instead of resid, to save space
        mu = exp(log(y :+ 1) - mu)
    }
    else {
        mu = 0.5 * (y :+ mean(y, HDFE.weight))
    }

    // Run IRLS algorithm (note that -mu- is both an input and output!)
    check_separation = anyof(separation, "mu")
    converged = inner_irls(mu, eta, check_separation, data=., z=., deviance=., eps=., separated_obs=J(0, 1, .))
    assert_msg(converged, sprintf("{err}Failed to converge in %4.0f iterations (eps=%9.6e){txt}\n", maxiter, eps), 430, 0)

    // Post-IRLS check for separation
    N_sep = rows(separated_obs)
    if (N_sep) {
        assert(check_separation)
        data = . // Conserve memory
        num_separated = num_separated + N_sep
        if (verbose > -1) printf("{txt}(IRLS step detected %g separated observation%s)\n", N_sep, N_sep > 1 ? "s" : "")
        non_separated_obs = trim_separated_obs(HDFE, y, x, weight_type, weight_var, true_w, separated_obs, verbose) // Note that we might separate more than N_sep obs. due to possible new singletons
        mu = mu[non_separated_obs]
        eta = eta[non_separated_obs]
        z = z[non_separated_obs]
        if (rows(offset)) offset = offset[non_separated_obs]
        remove_collinears(HDFE, target_inner_tol, x, k, stdev_x, weight_type, weight_var, true_w, verbose) // Will modify (HDFE.not_basevar, x, k, stdev_x) accordingly, and overwrite HDFE.weights

        // Re-run IRLS with trimmed data
        backup_iter = iter
        converged = inner_irls(mu, eta, 0, data=., z=., deviance=., eps=., separated_obs=J(0, 1, .))
        iter = iter + backup_iter
        assert_msg(converged, sprintf("{err}Failed to converge in %4.0f iterations (eps=%9.6e){txt}\n", maxiter, eps), 430, 0)
    }

    if (verbose > -1 & log) printf("{txt}{hline 108}\n")
    if (verbose > -1 & log) printf("{txt}(legend: {res}p{txt}: exact partial-out {res}s{txt}: exact solver {res}h{txt}: step-halving {res}o{txt}: epsilon below tolerance)\n")
    if (verbose > -1) printf("{txt}Converged in %g iterations and %g HDFE sub-iterations (tol =%4.0e)\n", iter, subiter, tolerance)
    st_local("ic", strofreal(iter))
    st_local("ic2", strofreal(subiter))

    // Compute results
    if (verbose > 0) printf("{txt} @@ Computing DoF\n")
    HDFE.vcetype = vcetype
    HDFE.clustervars = tokens(clustervars)
    HDFE.base_clustervars = tokens(base_clustervars)
    HDFE.num_clusters = length(HDFE.clustervars)
    HDFE.estimate_dof()

    if (verbose > 0) printf("{txt} @@ Computing final betas and standard errors\n")

    // Pseudo log-likelihood
    // We use lngamma() because lnfactorial() doesn't work with non-integers
    resid = y :* stdev_y :* (eta :+ log(stdev_y)) - mu :* stdev_y - lngamma(y :* stdev_y :+ 1)
    //resid = y :* stdev_y :* (log(mu) :+ log(stdev_y)) - mu :* stdev_y - lngamma(y :* stdev_y :+ 1) // Not as accurate on extreme cases, makes collinear2.do fail
    zero_sample = selectindex(y :== 0)
    resid[zero_sample] = -mu[zero_sample] :* stdev_y
    ll = quadsum(resid :* true_w)
    resid = .

    // Alternative; using the fact that LL = MAX_LL - Deviance / 2
    // resid = y :* stdev_y :* (log(y) :+ log(stdev_y) - 1) - lngamma(y :* stdev_y :+ 1)
    // resid[zero_sample] = -y[zero_sample] :* stdev_y
    // ll = quadsum(resid :* true_w) - deviance / 2

    // Pseudo log-likelihood of constant-only model
    ll_0_mu = mean(y, true_w)
    resid = y :* (stdev_y * (log(ll_0_mu) + log(stdev_y))) :- (ll_0_mu * stdev_y) :- lngamma(y :* stdev_y :+ 1)
    zero_sample = selectindex(y :== 0)
    resid[zero_sample] = J(rows(zero_sample), 1, -ll_0_mu * stdev_y)
    ll_0 = quadsum(resid :* true_w)
    resid = .
    // Prepare to recover _cons
    HDFE.compute_constant = 1 // Note: mu :* true_w == HDFE.weight
    HDFE.means = mean(log(mu), HDFE.weight) , mean(x, HDFE.weight)
    stdev_x = stdev_x, 1

    // With fweights we need to run an ad-hoc code, equivalent to aweights+fweights
    reghdfe_solve_ols(HDFE, data, b=., V=., N=., rank=., df_r=., resid=., ., "vce_asymptotic", weight_type == "fweight" ? true_w : J(0, 1, .))
    if (rows(offset)) b[rows(b)] = b[rows(b)] - mean(offset, HDFE.weight) // mu :* true_w

    // Run this before updating weights
    if (HDFE.save_any_fe | (d_name != "")) {
        // z  = x b + d + e    (z: working depvar)
        // zz = xx b + e       (zz: demeaned z)
        // THUS: d = z - xb - (e = zz - xx b)
        d = z - resid :- (cols(x) ? x * b[1..(rows(b)-1)] : 0)
        d = d :- mean(d, HDFE.weight)

        if (d_name != "") {
            if (HDFE.verbose > 0) printf("{txt} @@ Storing sum of fixed effects in {res}%s{txt}\n", d_name)
            //resid = resid :- log(stdev_y)
            HDFE.save_variable(d_name, d, "Sum of fixed effects")
        }

        if (HDFE.save_any_fe) {
            if (verbose > 0 & verbose < 3) printf("\n## Storing estimated fixed effects\n")
            HDFE.store_alphas(d)
        }

        // Debugging:
        // HDFE.save_variable("weight", HDFE.weight)
        // stata("su A B weight")      // Not zero mean
        // stata("su A B [iw=weight]") // Zero mean
    }

    // Rescale results
    // See https://stats.stackexchange.com/questions/175349/in-a-poisson-model-what-is-the-difference-between-using-time-as-a-covariate-or
    b[rows(b)] = b[rows(b)] + log(stdev_y)
    b = b :/ stdev_x'
    V = V :/ (stdev_x' * stdev_x)
    // resid = resid :* stdev_y // BUGBUG?
    HDFE.load_weights(weight_type, weight_var, true_w, 1) // Why verbose==1? BUGBUG
    assert(cols(data) - 1 == rows(b) - HDFE.compute_constant)
    data = .
    chi2 = HDFE.F * HDFE.df_m // Wald test; based on output by reghdfe_solve_ols()

    // Add constant
    if (verbose > 1) printf("\n{txt}## Adding _cons to varlist\n")
    assert_msg(rows(HDFE.not_basevar) == 1, "rows(S.not_basevar) == 1")
    HDFE.not_basevar = HDFE.not_basevar, 1
    HDFE.fullindepvars = HDFE.fullindepvars + " _cons"
    HDFE.indepvars = HDFE.indepvars, " _cons"

    // Add base/omitted variables
    add_base_variables(HDFE, b, V)

    // Post results
    st_matrix(bname, b')
    st_matrix(Vname, V)
    st_numscalar(nname, N)
    st_numscalar(rname, rank)
    st_numscalar(dfrname, df_r)
    st_numscalar(llname, ll)
    st_numscalar(ll_0name, ll_0)
    st_numscalar(chi2name, chi2)
    st_numscalar(devname, deviance)

    // Need to save resids if saving FEs, even if temporarily
    if (HDFE.residuals == "" & HDFE.save_any_fe) {
        HDFE.residuals = "__temp_reghdfe_resid__"
    }

    // BUGBUG ! We shouldn't need to update touse!!
    //st_dropvar(HDFE.tousevar)
    //HDFE.save_touse("", 0)
    HDFE.save_touse("", 1)
}


// --------------------------------------------------------------------------
// Iteratively Re-weighted Least Squares (IRLS)
// --------------------------------------------------------------------------
// WARNING:
// Discussion on numerical stability: very low (but positive) values of mu
// (This is related to the "collinear2.do" test)
// If our LHS has both very high and very low values, then standardizing -y- will make the very low values *extremely low*
// Then, the ratio y/mu (and thus log(..)) can be wildly inaccurate
// Example from obs. 5 of collinear2.do:
// y=134.9833527, eta=-1.23202993; thus mu=0.291699846, y/mu=462.7474252 and log(y/mu)=6.137181387
// However, if stdev_y=1.17511e+14, then mu=2.22045e-14, y/mu=51.73238502 and log(y/mu)=3.946083988
// Instead, let's do y/mu=exp(log(y)-eta)=462.7474256 and log(y/mu)=log(y)-eta=6.137181388, which are WAY CLOSER to the correct soln (!!)
// (The cost is of course a slower computation for y/mu)
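// Standalone illustration of the point above (a sketch to run interactively,
// not part of this file's control flow), reusing the figures from the comment:
//     y   = 134.9833527
//     eta = -1.23202993
//     log(y / exp(eta))   // forms the ratio y/mu explicitly
//     log(y) - eta        // same quantity on the log scale; stays accurate
//                         // even after y and mu are rescaled by factors as
//                         // large as stdev_y = 1.17511e+14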
`Boolean' GLM::inner_irls(`Variable' mu,                // Initial value
                          `Variable' eta,               // Will be returned (to compute log-likelihood)
                          `Boolean' check_separation,
                          `Variables' data,             // Return transformed data (y,x)
                          `Variable' z,                 // Return working depvar; only used when saving FEs
                          `Real' deviance,              // Return final deviance
                          `Real' eps,                   // Return eps; the convergence criterion
                          `Vector' separated_obs)
{
    `Integer'               ok, N_sep, col, num_step_halving
    `Variable'              separation_mask, z_last, irls_w, resid, old_eta
    `Vector'                zero_sample
    `Vector'                b
    `Real'                  log_septol, old_deviance, delta_deviance, alt_tol, highest_inner_tol, denom_eps, min_eta, adjusted_log_septol
    `Boolean'               iter_fast_partial, iter_fast_solver, iter_step_halving
    `String'                iter_text
    `Matrix'                last_x
    `Vector'                eps_history
    `Real'                  predicted_eps

    // Sanity checks
    assert(k == cols(x))
    assert(0 < step_halving_memory & step_halving_memory < 1)

    // WARNING:
    // If the initial value of MU is too close to zero, then
    // when computing z = "eta - 1 + y / mu" we end up with a high value
    // and that eventually leads to mu=exp(eta) being infinite
    // EG: if y/mu=1e+2, exp(z) = 2e+43 (!!!)
    censor_mu(mu, y, verbose)

    // Initialize IRLS
    highest_inner_tol = max((1e-12, min((target_inner_tol, 0.1 * tolerance)) )) // This is the *actual* target tolerance; HDFE.partial_out() will never have a tolerance higher than this
    log_septol = log(mu_tol) // sep. tolerance in terms of Mu and not Eta
    eta = log(mu)
    HDFE.load_weights("aweight", "", y, 1) // y is just a placeholder; we'll place (true_w*mu) later
    eps = deviance = .
    ok = N_sep = iter_step_halving = num_step_halving = 0
    separation_mask = z = z_last = J(rows(eta), 1, 0)
    zero_sample = selectindex(y :== 0)
    eps_history = J(3, 1, .)

    if (verbose > 0) {
        printf("{txt} @@ Starting IRLS\n")
        printf("{txt} Target HDFE tolerance:{res}%-9.4e{txt}\n", highest_inner_tol)
        if (verbose > 2) stata("memory")
    }

    // Iterate
    iter = 0
    while ((ok < min_ok) & (++iter <= maxiter)) {

        iter_fast_partial = !use_exact_partial & (iter > 1)
        iter_fast_solver = !iter_step_halving & !use_exact_solver & k & (HDFE.tolerance > tolerance * 11)
        if (use_heuristic_tol) {
            predicted_eps = predict_eps(eps_history, eps)
            iter_fast_solver = iter_fast_solver & (predicted_eps > tolerance)
        }

        // (a) Update weights: W = μ
        if (verbose > 1) printf("{txt} @@@ HDFE.update_sorted_weights()\n")
        assert_msg(!hasmissing(mu), sprintf("mu has infinite values on iteration %g; aborting", iter), 9003, 0)
        irls_w = true_w :* mu
        HDFE.update_sorted_weights(irls_w)
        HDFE.update_cvar_objects()

        // (b) Update working variable z = η + (y - μ) / μ - offset
        //                               = η + y/μ - 1 - offset
        // Note: lim z (μ->0; y=0) = η - 1 - offset + 0/ε
        //                         = η - 1 - offset
        if (rows(offset)) {
            z = eta - offset :- 1 + y :/ mu
            // z = eta - offset :- 1 + exp(log(y) - eta) // Perhaps more accurate?
            if (check_separation) z[zero_sample] = eta[zero_sample] - offset[zero_sample] :- 1
        }
        else {
            z = eta :- 1 + y :/ mu
            // z = eta :- 1 + exp(log(y) - eta) // Perhaps more accurate?
            if (check_separation) z[zero_sample] = eta[zero_sample] :- 1
        }
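        // Where step (b) comes from (standard IRLS algebra for the log link,
        // included here only as a reminder): with g(μ) = log(μ) we have
        // g'(μ) = 1/μ, so the working variable is
        //     z = g(μ) + (y - μ) * g'(μ) - offset
        //       = η + (y - μ)/μ - offset
        //       = η + y/μ - 1 - offset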
        // (c) Data is (z, X)
        if (iter_fast_partial) {
            data[., 1] = data[., 1] + z - z_last
            // data[., 1] = data[., 1] - (last_x - data[., 2..cols(data)]) * b[1..k]
        }
        else {
            data = (z, x)
        }
        //last_x = data[., 2..cols(data)]

        // (d.1) Partial out data
        if (verbose > 1) printf("{txt} @@@ HDFE._partial_out()\n")
        _edittozerotol(data, min((tolerance, 1e-12)) ) // see test: hard2.do
        if (iter > 1) (void) --HDFE.verbose
        HDFE._partial_out(data, 0, 0, 0, 1) // Don't standardize vars; flush aux vectors
        if (iter > 1) (void) ++HDFE.verbose
        _edittozerotol(data, min((tolerance, 1e-12)) )
        subiter = subiter + HDFE.iteration_count

        // (d.2) Solve β and compute residuals
        if (verbose > 1) printf("{txt} @@@ reghdfe_solve_ols()\n")
        if (iter_fast_solver) {
            // Faster solution (~5% runtime) when still away from converging
            b = fastsolve(data, HDFE.weight)
            if (verbose > 1) b'
            resid = data * (1 \ -b)
        }
        else {
            // Good-quality estimates when close to the solution
            reghdfe_solve_ols(HDFE, data, b=., ., ., ., ., resid=., ., "vce_none")
            if (verbose > 1) b'
        }

        if (verbose > 1) printf("{txt} @@@ updating eta/mu/deviance\n")

        // (e) Update η = z - resid = xβ + d
        if (!iter_step_halving) swap(old_eta, eta) // A faster alternative to "old_eta = eta"
        if (rows(offset)) {
            eta = z - resid + offset
        }
        else {
            eta = z - resid
        }

        if (check_separation) {
            // Add min(eta + 5 | y > 0) to log_septol, if that is negative
            // Essentially, this uses a more conservative tolerance if eta is also low when y>0
            // But the effect only kicks in when eta is below -5
            adjusted_log_septol = log_septol + min(( min(select(eta, y:>0)) + 5, 0 ))
            separation_mask = separation_mask :| ( (eta :<= adjusted_log_septol) :& (y :== 0) )
            separated_obs = selectindex(separation_mask)
            N_sep = rows(separated_obs)
        }

        // (f) Update μ = exp(η)
        mu = exp(eta)
        if (N_sep) mu[separated_obs] = J(N_sep, 1, 0)
        _vector_scalar_max(mu, epsilon(100)) // the result might oscillate endlessly if mu is too close to epsilon()

        // (g) Update deviance:
        // Dev = 2 { Σ[μ] - Σ[y] + (y>0) * Σ[y log(y/μ)] }
        swap(z, z_last)
        old_deviance = deviance
        //deviance = quadsum((mu - y) :* true_w) + quadcross(y, (y :> 0) :* true_w, log(y :/ mu) )
        deviance = quadsum((mu - y) :* true_w) + quadcross(y, (y :> 0) :* true_w, log(y) - eta )
        if (2 * deviance / rows(y) < epsilon(1)) deviance = 0 // We are within macheps accuracy of zero
        deviance = 2 * edittozerotol(deviance, epsilon(1))
        if (deviance < 0) deviance = 0
        delta_deviance = old_deviance - deviance

        // Trick: since Dev>0, in the next iteration ΔDev can't exceed the current Dev
        if (!missing(delta_deviance) & (deviance < 0.1 * delta_deviance)) {
            delta_deviance = deviance
            if (verbose > 0) printf("{txt} - note: deviance is already very close to zero\n")
        }
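        // Step (g) is the usual Poisson deviance, rearranged so that
        // log(y/μ) is evaluated as log(y) - η (see the WARNING above
        // inner_irls for why):
        //     Dev = 2 Σ w_i [ y_i log(y_i/μ_i) - (y_i - μ_i) ]
        //         = 2 { Σ[w (μ - y)] + Σ_{y>0}[w y (log(y) - η)] }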
        // Stopping criteria:
        // - It's HARD to choose a good stopping criteria
        // - Note: unless the model has no constant, sum(y) == sum(mu) at convergence
        if (iter > 1) {
            // Alternatives:
            //eps = abs(delta_deviance) / (0.1 + deviance) // R criterion: https://github.com/SurajGupta/r-source/blob/a28e609e72ed7c47f6ddfbb86c85279a0750f0b7/src/library/stats/R/glm.R#L302
            //eps = abs(delta_deviance) // Stata criterion: glm.ado (line 1060)
            //eps = delta_deviance / deviance // Julia criterion: https://github.com/JuliaStats/GLM.jl/blob/84da7f178589ebd5aa131e92be5aff8baa9a9636/src/glmfit.jl#L262
            //eps = abs(delta_deviance) / deviance // Modified Julia criterion: https://github.com/JuliaStats/GLM.jl/blob/84da7f178589ebd5aa131e92be5aff8baa9a9636/src/glmfit.jl#L262
            //eps = abs(delta_deviance) / (0.1 + min((deviance, old_deviance))) // Criterion from version 1 of ppmlhdfe.ado

            // denom_eps = max(( min((deviance, old_deviance)) , 0.1 / stdev_y ))
            denom_eps = max(( min((deviance, old_deviance)) , 0.1 ))
            eps = abs(delta_deviance) / denom_eps

            // Never used:
            //eps, delta_deviance, denom_eps, ., min((deviance, old_deviance)), 0.1, 0.1 / stdev_y
            //eps = mreldif(deviance, old_deviance) // maybe this is safer?
            //eps = mean(reldif(deviance, old_deviance))
            //eps = abs(delta_deviance) / (0.1 * stdev_y + min((deviance, old_deviance)))
            //eps = abs(delta_deviance) / max(( min((deviance, old_deviance)) , epsilon(100) ))

            // Declare convergence once we have enough non-accelerated iterations where eps < tol
            if (eps < tolerance) {
                if (use_heuristic_tol & HDFE.accuracy >= 0) {
                    // HDFE.accuracy can be -1 with LSMR (LSMR does not update accuracy as it uses multiple tols)
                    assert(HDFE.accuracy <= HDFE.tolerance)
                    if (!iter_fast_solver & (HDFE.accuracy <= 1.1 * highest_inner_tol | HDFE.G==1)) {
                        ok = ok + 1
                    }
                }
                else {
                    if (!iter_fast_solver & (HDFE.tolerance <= 1.1 * highest_inner_tol | HDFE.G==1)) {
                        ok = ok + 1
                    }
                }
            }
            else if (use_step_halving & (delta_deviance < 0) & (num_step_halving < max_step_halving)) {
                // Run step-halving AFTER checking for convergence
                eta = step_halving_memory * old_eta + (1 - step_halving_memory) * eta
                if (num_step_halving > 0) update_mask(eta, selectindex(eta :< -10), -10) // If the first step halving was not enough, clip very low values of eta
                mu = exp(eta)
                iter_step_halving = 1
                ok = 0
                num_step_halving = num_step_halving + 1
            }
            else {
                iter_step_halving = 0
                num_step_halving = 0
            }
        }

        // Progress report
        if (verbose > -1 & log) {
            col = 0
            iter_text = sprintf("{txt}{col %2.0f}Iteration %g:", col, iter)
            col = col + 16
            iter_text = iter_text + sprintf("{txt}{col %2.0f}deviance = {res}%-11.5e", col, deviance * stdev_y)
            col = col + 23
            iter_text = iter_text + sprintf("{col %2.0f}{txt}eps = {res}%-9.4e{txt}", col, eps)
            col = col + 16
            iter_text = iter_text + sprintf("{col %2.0f}{txt}iters = {res}%g", col, HDFE.iteration_count)
            col = col + 13
            iter_text = iter_text + sprintf("{col %2.0f}{txt}tol ={res}%5.0e", col, HDFE.tolerance)
            col = col + 14
            min_eta = min(select(eta, !separation_mask))
            iter_text = iter_text + sprintf("{col %2.0f}{txt} min(eta) = {%s}%6.2f", col, min_eta < log_septol - 1 & !check_separation ? "err" : "res", min_eta ) // Add "& verbose>0"
            col = col + 20
            //iter_text = iter_text + sprintf("{txt}{col %2.0f}[{txt}%s%s%s%s{txt}] ", col, iter_fast_partial ? " " : "p", iter_fast_solver ? " " : "s", iter_step_halving ? "h" : " ", ok ? "o" : " ")
            iter_text = iter_text + " {txt}"
            iter_text = iter_text + (iter_fast_partial ? " " : "P")
            iter_text = iter_text + (iter_fast_solver ? " " : "S")
            iter_text = iter_text + (iter_step_halving ? "H" : " ")
            iter_text = iter_text + (ok ? "O" : " ")
            col = col + 6
            if (N_sep) iter_text = iter_text + sprintf("{col %2.0f}{txt} sep.obs. = {res}%g", col = col, N_sep)
            printf(iter_text + "\n")
        }

        // If using step halving, start a new iteration after the progress report
        if (iter_step_halving) {
            deviance = old_deviance
            continue
        }

        if (ok >= min_ok | ok >= 1 & deviance == 0) {
            deviance = deviance * stdev_y
            return(1) // converged=1
        }
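        // Numeric trace of the tolerance schedule below, under the default
        // settings (a sketch): tolerance = 1e-8, start_inner_tol = 1e-4,
        // target_inner_tol = 1e-9, so
        //     highest_inner_tol = max(1e-12, min(1e-9, 0.1 * 1e-8)) = 1e-9
        // HDFE.tolerance starts at max(1e-4, 1e-8) = 1e-4 and is tightened by
        // at least 10x each time eps falls below it, but never goes beyond
        // highest_inner_tol.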
        // As IRLS starts to converge, switch to stricter tolerances when partialling out
        if (use_heuristic_tol) {
            if (eps < HDFE.tolerance) {
                // Increase HDFE tol by at least 10x.
                // Go further if IRLS is converging fast enough.
                // But don't increase beyond the user-requested tol!
                // (Note: "max((eps, epsilon(1)))" avoids missing values when eps==0)
                HDFE.tolerance = max((min((0.1 * HDFE.tolerance, alt_tol)), highest_inner_tol))
                alt_tol = 10 ^ -ceil(log10(1 / max((0.1 * eps, epsilon(1))) ))
            }
            if (use_exact_partial & HDFE.tolerance > tolerance) HDFE.tolerance = 0.1 * tolerance // BUGBUG??
            if (inrange(tolerance, predicted_eps, eps) & inrange(HDFE.tolerance/highest_inner_tol, 1.1, 10.1) & (HDFE.accuracy/highest_inner_tol <= 10.1)) {
                HDFE.tolerance = 0.1 * HDFE.tolerance
            }
        }
        else {
            if (eps < HDFE.tolerance) {
                // Increase HDFE tol by at least 10x.
                // Go further if IRLS is converging fast enough.
                // But don't increase beyond the user-requested tol!
                // (Note: "max((eps, epsilon(1)))" avoids missing values when eps==0)
                HDFE.tolerance = max((min((0.1 * HDFE.tolerance, alt_tol)), highest_inner_tol))
                alt_tol = 10 ^ -ceil(log10(1 / max((0.1 * eps, epsilon(1))) ))
            }
            if (use_exact_partial & HDFE.tolerance > tolerance) HDFE.tolerance = 0.1 * tolerance // BUGBUG??
        }
    }

    return(0) // converged=0
}

end
exit
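// Illustrative usage from Stata once the package is installed (a sketch;
// variable and absorb() names are placeholders):
//     ppmlhdfe y x1 x2, absorb(firm year) vce(cluster firm)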