// FCOLLAPSE - Aggregate Functions
// -data- vector must be already sorted by F: F.sort(data)
// Same for -weights- vector

mata:
mata set matastrict on

`Dict' aggregate_get_funs()
{
	`Dict'					funs
	funs = asarray_create("string", 1)
	asarray_notfound(funs, NULL)
	asarray(funs, "count", &aggregate_count())
	asarray(funs, "mean", &aggregate_mean())
	asarray(funs, "sum", &aggregate_sum())
	asarray(funs, "min", &aggregate_min())
	asarray(funs, "max", &aggregate_max())
	asarray(funs, "first", &aggregate_first())
	asarray(funs, "last", &aggregate_last())
	asarray(funs, "firstnm", &aggregate_firstnm())
	asarray(funs, "lastnm", &aggregate_lastnm())
	asarray(funs, "percent", &aggregate_percent())
	asarray(funs, "quantile", &aggregate_quantile())
	asarray(funs, "iqr", &aggregate_iqr())
	asarray(funs, "sd", &aggregate_sd())
	asarray(funs, "nansum", &aggregate_nansum())
	// ...
	return(funs)
}


`Matrix' select_nm_num(`Vector' data) {
	// Return matrix in case the answer is 0x0
	return(select(data, data :< .))
}


`StringMatrix' select_nm_str(`StringVector' data) {
	return(select(data, data :!= ""))
}


`DataCol' aggregate_count(`Factor' F, `DataCol' data, `Vector' weights, `String' wtype)
{
	if (wtype == "" | wtype == "aweight") {
		return( `panelsum'(data :<., 1, F.info) )
	}
	else {
		return( `panelsum'(data :<., weights, F.info) )
	}
	// Older:
	//`Integer'	            i
	//`DataCol'	            results
	//results = J(F.num_levels, 1, missingof(data))
	//for (i = 1; i <= F.num_levels; i++) {
    //    results[i] = nonmissing(panelsubmatrix(data, i, F.info))
	//}
	//return(results)
}


`Vector' aggregate_sum(`Factor' F, `Vector' data, `Vector' weights, `String' wtype)
{
	if (wtype == "") {
		return( `panelsum'(editmissing(data, 0), 1, F.info) )
	}
	else if (wtype == "aweight") {
		`Vector' sum_weights
		// normalize weights so they add up to number of obs. in the subgroup
		sum_weights = `panelsum'(weights :* (data :< .), F.info) :/ `panelsum'(data :< ., F.info)
		return( `panelsum'(editmissing(data, 0), weights, F.info) :/ sum_weights )
	}
	else {
		return( `panelsum'(editmissing(data, 0), weights, F.info) )
	}
}


`Vector' aggregate_nansum(`Factor' F, `Vector' data, `Vector' weights, `String' wtype)
{
	assert(wtype == "")
	return( `panelsum'(editmissing(data, 0), 1, F.info) :/ (`panelsum'(data :<., 1, F.info) :> 0) )
}


`Vector' aggregate_mean(`Factor' F, `Vector' data, `Vector' weights, `String' wtype)
{
	if (wtype == "") {
		return( aggregate_sum(F, data, 1, "") :/ aggregate_count(F, data, 1, "") )
	}
	else {
		// http://www.statalist.org/forums/forum/general-stata-discussion/general/289901-collapse-and-weights
		return( aggregate_sum(F, data, weights, "iweight") :/ aggregate_count(F, data, weights, "iweight") )
	}

	// Older:
	//`Integer'	            i
	//`Vector'	            results
	//results = J(F.num_levels, 1, .)
	//for (i = 1; i <= F.num_levels; i++) {
    //    results[i] = mean(panelsubmatrix(data, i, F.info), weights)
	//}
	//return(results)
}


`Vector' aggregate_min(`Factor' F, `Vector' data, `Vector' weights, `String' wtype)
{
	`Integer'	            i
	`Vector'	            results
	results = J(F.num_levels, 1, .)
	for (i = 1; i <= F.num_levels; i++) {
        results[i] = colmin(panelsubmatrix(data, i, F.info))
	}
	return(results)
}


`Vector' aggregate_max(`Factor' F, `Vector' data, `Vector' weights, `String' wtype)
{
	`Integer'	            i
	`Vector'	            results
	results = J(F.num_levels, 1, .)
	for (i = 1; i <= F.num_levels; i++) {
        results[i] = colmax(panelsubmatrix(data, i, F.info))
	}
	return(results)
}


`DataCol' aggregate_first(`Factor' F, `DataCol' data, `Vector' weights, `String' wtype)
{
	`Integer'	            i
	`DataCol'	            results
	results = J(F.num_levels, 1, missingof(data))
	for (i = 1; i <= F.num_levels; i++) {
        results[i] = data[F.info[i, 1]]
	}
	return(results)
}


`DataCol' aggregate_last(`Factor' F, `DataCol' data, `Vector' weights, `String' wtype)
{
	`Integer'	            i
	`DataCol'	            results
	results = J(F.num_levels, 1, missingof(data))
	for (i = 1; i <= F.num_levels; i++) {
        results[i] = data[F.info[i, 2]]
	}
	return(results)
}


`DataCol' aggregate_firstnm(`Factor' F, `DataCol' data, `Vector' weights, `String' wtype)
{
	`Integer'	            i
	`DataCol'	            results, tmp
	pointer(`Vector')		fp
	results = J(F.num_levels, 1, missingof(data))
	fp = isstring(data) ? &select_nm_str() : &select_nm_num()
	for (i = 1; i <= F.num_levels; i++) {
		tmp = (*fp)(panelsubmatrix(data, i, F.info))
		if (rows(tmp) == 0) continue
        results[i] = tmp[1]
	}
	return(results)
}


`DataCol' aggregate_lastnm(`Factor' F, `DataCol' data, `Vector' weights, `String' wtype)
{
	`Integer'	            i
	`DataCol'	            results, tmp
	pointer(`Vector')		fp
	results = J(F.num_levels, 1, missingof(data))
	fp = isstring(data) ? &select_nm_str() : &select_nm_num()
	for (i = 1; i <= F.num_levels; i++) {
		tmp = (*fp)(panelsubmatrix(data, i, F.info))
		if (rows(tmp) == 0) continue
        results[i] = tmp[rows(tmp)]
	}
	return(results)
}


`Vector' aggregate_percent(`Factor' F, `DataCol' data, `Vector' weights, `String' wtype)
{
	`Vector'	            results
	results = aggregate_count(F, data, weights, wtype)
	return(results :/ (quadsum(results) / 100))
}


`Vector' aggregate_quantile(`Factor' F, `Vector' data, `Vector' weights, `String' wtype, 
                            `Integer' P)
{
	`Integer'	            i
	`Vector'	            results, tmp_data, tmp_weights
	`Boolean'				has_fweight
	
	results = J(F.num_levels, 1, .)
	
	if (wtype == "") {
		for (i = 1; i <= F.num_levels; i++) {
	        // SYNTAX: mm_quantile(data, | weights, P, def, fw?, ..)
	        tmp_data = panelsubmatrix(data, i, F.info)
	        tmp_data = select(tmp_data, tmp_data :< .)
	        if (rows(tmp_data) == 0) continue
	        results[i] = mm_quantile(tmp_data, 1, P, 2)
		}
	}
	else {
		has_fweight = wtype == "fweight"
		for (i = 1; i <= F.num_levels; i++) {
	        tmp_data = panelsubmatrix(data, i, F.info)
	        tmp_weights = panelsubmatrix(weights, i, F.info)
	        tmp_weights = select(tmp_weights, tmp_data :< .)
	        tmp_data = select(tmp_data, tmp_data :< .)
	        if (rows(tmp_data) == 0) continue
	        results[i] = mm_quantile(tmp_data, tmp_weights, P, 2, has_fweight)
		}
	}

	return(results)
}


`Vector' aggregate_iqr(`Factor' F, `Vector' data, `Vector' weights, `String' wtype)
{
	`Integer'	            i
	`Vector'	            results, tmp_data, tmp_weights, P
	`RowVector'				tmp_iqr
	`Boolean'				has_fweight

	results = J(F.num_levels, 1, .)
	P = (0.25\0.75)
	
	if (wtype == "") {
		for (i = 1; i <= F.num_levels; i++) {
	        // SYNTAX: mm_iqrange(X [, w, def, fw, wd])
	        tmp_data = panelsubmatrix(data, i, F.info)
	        tmp_data = select(tmp_data, tmp_data :< .)
	        if (rows(tmp_data) == 1) results[i] = 0
	        if (rows(tmp_data) <= 1) continue
	        results[i] = mm_iqrange(tmp_data, 1, 2)
		}
	}
	else {
		has_fweight = wtype == "fweight"
		for (i = 1; i <= F.num_levels; i++) {
	        tmp_data = panelsubmatrix(data, i, F.info)
	        tmp_weights = panelsubmatrix(weights, i, F.info)
	        tmp_weights = select(tmp_weights, tmp_data :< .)
	        tmp_data = select(tmp_data, tmp_data :< .)
	        if (rows(tmp_data) == 1) results[i] = 0
	        if (rows(tmp_data) <= 1) continue
	        results[i] = mm_iqrange(tmp_data, tmp_weights, 2, has_fweight)
		}
	}

	return(results)
}


`Vector' aggregate_sd(`Factor' F, `Vector' data, `Vector' weights, `String' wtype)
{
	`Integer'	            i
	`Vector'	            results, adjustment, tmp_weights
	if (wtype == "pweight") {
		_error("sd not allowed with pweights")
	}
	results = J(F.num_levels, 1, .)

	if (wtype == "") {
		for (i = 1; i <= F.num_levels; i++) {
	        results[i] = sqrt(quadvariance(panelsubmatrix(data, i, F.info)))
		}
	}
	else {
		printf("{err}warning: option sd has not been properly tested with weights!!!!")
		for (i = 1; i <= F.num_levels; i++) {
			tmp_weights = panelsubmatrix(weights, i, F.info)
			tmp_weights = tmp_weights :/ quadsum(tmp_weights) * 1000000000 // why? bugbug
	        results[i] = sqrt(quadvariance(panelsubmatrix(data, i, F.info), tmp_weights))
		}
		adjustment = aggregate_count(F, data, 1, "")
		adjustment = sqrt(adjustment :/ (adjustment :- 1))
		results =  results :* adjustment
	}
	return(results)
}
end