// Mar 12 2023
// Brendan Halpin brendan.halpin@ul.ie

version 10.0


// This file runs through most of the functionality of the SADI package
// for Stata, to demonstrate some of its uses.
//
// Because it calculates several distance matrices, it takes several minutes to run.
// 
// It uses the McVicar/Anyadike-Danes data, representing 72 months of the
// labour market history of young people in Northern Ireland. Thanks guys.
// 
// Duncan McVicar and Michael Anyadike-Danes (2002), Predicting Successful
// and Unsuccessful Transitions from School to Work Using Sequence Methods,
// Journal of the Royal Statistical Society (Series A), 165, pp317-334.

use mvad
sort id

// Use the substitution matrix from the MVAD paper
matrix mvdanes = (0,1,1,2,1,3 \ ///
                  1,0,1,2,1,3 \ ///
                  1,1,0,2,1,2 \ ///
                  2,2,2,0,1,1 \ ///
                  1,1,1,1,0,2 \ ///
                  3,3,2,1,2,0 )

set matsize 4000

// Get pairwise distance matrices for a range of different distance measures

oma         state1-state72, subsmat(mvdanes) pwd(omd) length(72) indel(1.5)
omav        state1-state72, subsmat(mvdanes) pwd(omv) length(72) indel(1.5)
sdhollister state1-state72, subsmat(mvdanes) pwd(hol) length(72) timecost(0.5) localcost(0.5)
twed        state1-state72, subsmat(mvdanes) pwd(twd) length(72) lambda(0.5) nu(0.04) 
sdhamming   state1-state72, subsmat(mvdanes) pwd(ham) 
dynhamming  state1-state72,                  pwd(dyn) 

// Prepare the data in X/t spell format and get duration-weighted combinatorial (Elzinga) distance:
cal2spell, state(state) spell(sp) length(len) nsp(nspells)
local spmax = r(maxspells)
combinadd sp1-len`spmax', pwsim(xts) nspells(nspells) nstates(6) rtype(d)

// Rather than use MVAD's substitution matrix, generate one from transition probabilities
// Note the one without the diagonal has more variation
preserve
reshape long state, i(id) j(m)
trans2subs state, id(id) subs(tpr1)
matrix list tpr1
trans2subs state, id(id) subs(tpr2) diag
matrix list tpr2
restore

// Get OMA distance with the transition probability substitution matrix
oma        state1-state72, subsmat(tpr1) pwd(tpr) length(72) indel(1.5)


// Compare all distance matrices with OMA
// corrsqm with the nodiag option gives the correlation between the distances
// between all pairs, excluding the zero distances on the diagonal
foreach dist in dyn ham twd hol omv xts tpr {
  corrsqm omd `dist', nodiag
}

// Test all the distance matrices to ensure they obey the triangle inequality
// (omv and hol do not)

foreach dist in dyn ham oma twd hol omv xts tpr {
  metricp `dist'
}

// Do cluster analysis on OMA and TWED distances
// Generate 8 and 12 cluster solutions for each
clustermat wards omd, name(oma) add
cluster generate o=groups(8 12)

clustermat wards twd, name(twd) add
cluster generate t=groups(8 12)

// Compare clusterings using Adjusted Rand Index
ari o8 t8

// Compare clusterings using permtab
// gen(pt8) creates a permuted version of t8 so that it matches o8
// as closely as possible

permtab o8 t8, gen(pt8)
tab o8 pt8


// For 12 clusters, permutation takes 9*10*11*12=11880 times as long as for 8
// To deal with this, permtab with the option algorithm(hc) yields an approximate-best permutation using
// a fast hill-climb algorithm
permtab o12 t12, gen(pt18) algo(hc)
// See also -algo(ga)- which uses a (slow) genetic algorithm

// Studer et al's discrepancy measure
// See M Studer, G Ritschard, A Gabadinho and NS Müller, Discrepancy
//  analysis of state sequences, Sociological Methods and Research,
//  40(3):471-510

sddiscrep o8, dist(omd) id(id)
sddiscrep o12, dist(omd) id(id)
sddiscrep grammar, dist(omd) id(id)
sddiscrep grammar, dist(omd) id(id) niter(1000)

// Descriptive summaries of sequences

//  - string representation of sequences
sdstripe state1-state72, gen(seqstr) symbols("EFHSTU")
list seqstr in 1/5, clean

//  - string representation of sequences: condensed format
sdstripe state1-state72, gen(seqstrxt) symbols("EFHSTU") xt xtsp("/") xtdur(":")
list seqstrxt in 1/5, clean

// Use sddiscrep to identify medoids
sddiscrep o8, dist(omd) id(id) dcg(dx) niter(1) // niter(1) since p-value not needed
sort o8 dx
by o8: gen medoid = _n==1
list o8 dx seqstrxt if medoid, clean
sort id

// - cumulated duration in each state
cumuldur state1-state72, cd(dur) nstates(6)

su dur*
table o8, c(mean dur1 mean dur2 mean dur3) format(%5.2f)
table o8, c(mean dur4 mean dur5 mean dur6) format(%5.2f)

drop dur1-dur6

// - sdentropy: A simple measure of Shannon entropy
sdentropy state1-state72, gen(ent) cd(dur) nstates(6)
table o8, c(mean ent)

// - nspells, number of spells
nspells state1-state72, gen(nsp)
tab nsp

// Graphics

// 1: Chronogram

sdchronoplot state*, by(o8, legend(off)) name(chronogram, replace)

// 2: indexplot 

//    Generate a maximal clustering (as many clusters as distinct sequences)
//    This allows us to order sequences within clusters such that 
//    subcluster-structure is preserved
cluster generate o999 = groups(750), name(oma) ties(fewer)

sdindexplot state*, by(o8, note("") legend(off)) order(o999) name(indexplot, replace)

// 3: Transition pattern
trprgr state*, gmax(485)

// A matrix of transition rates: maketrpr generates the matrix of transition rates
// that is used by dynhamming and trprgr, using tssmooth to average over a moving
// window of successive transitions. 

maketrpr state*, mat(mkt) ma(5)

matlist mkt[1..6,.]
matlist mkt[25..30,.]

// Comparison to "reference" or "ideal type" sequences
// oma, twed and dynhamming are set up to calculate distances to
// specific sequences, as an alternative to all pairwise distances
// It creates an N by M distance matrix where distances to the first M
// sequences are calculated

// Example: Begin by creating 3 ideal sequences in 3 extra rows at the start
expand 4 in 1
sort id
forvalues x = 1/72 {
  qui {
    // Seq 1: school:university:employment
    replace state`x' = 4 in 1 if inrange(`x',1,24)  // 2 years' school
    replace state`x' = 3 in 1 if inrange(`x',25,60) // 3 years' university
    replace state`x' = 1 in 1 if inrange(`x',61,72) // 1 year's employment
    // Seq 2: school:FE:unemployment:employment
    replace state`x' = 4 in 2 if inrange(`x',1,12)  // 1 year's school
    replace state`x' = 2 in 2 if inrange(`x',13,24) // 1 year's further education
    replace state`x' = 6 in 2 if inrange(`x',25,36) // 1 year's unemployment
    replace state`x' = 1 in 2 if inrange(`x',37,72) // 3 years' employment
    // Seq 3: ue:training:employment:ue:employment
    replace state`x' = 6 in 3 if inrange(`x',1,12)  // 1 year's unemployment
    replace state`x' = 5 in 3 if inrange(`x',13,24) // 1 year's training
    replace state`x' = 1 in 3 if inrange(`x',25,36) // 1 year's employment
    replace state`x' = 6 in 3 if inrange(`x',37,48) // 1 year's unemployment
    replace state`x' = 1 in 3 if inrange(`x',49,60) // 1 year's employment
    replace state`x' = 6 in 3 if inrange(`x',61,72) // 1 year's unemployment
  }
}

// View reference sequences
drop seqstr
sdstripe state1-state72, gen(seqstr) symbols("EFHSTU")
list seqstr in 1/3, clean

// For OMA, TWED and Dynamic Hamming, the commands take a "ref(n)" option
// which creates an _N*n matrix of distances to the first n sequences in the data

// For Hamming distances, use =oma= with a large indel

oma        state1-state72, subsmat(mvdanes) pwd(ham3) ref(3) length(72) indel(999)
oma        state1-state72, subsmat(mvdanes) pwd(omd3) ref(3) length(72) indel(1.5)
twed       state1-state72, subsmat(mvdanes) pwd(twd3) ref(3) length(72) lambda(0.5) nu(0.04) 
dynhamming state1-state72,                  pwd(dyn3) ref(3)

// Transfer one _N*3 matrix to variables and see how it compares with clusters based on
// all pairwise distances. 
svmat omd3
table o8, c(mean omd31 mean omd32 mean omd33) f(%6.3f)
list o8 seqstrxt omd3* if medoid, clean