/* Oct 13 2008 23:18:56 Add subsequence lengths together instead of multiply in the first stage Copyright 2007 Brendan Halpin brendan.halpin@ul.ie Distribution is permitted under the terms of the GNU General Public Licence */ /* Dec 23 2006 17:11:09 Variant of hashplugin.c to generate spell version Main principle of both versions is to enumerate all tuples individually in a first pass (O(N) ignoring length of sequences). This is done for each case using a recursive function to identify tuples and to record their count in a hash structure. The data is then transferred into a sparse-matrix equivalent structure consisting of two matrices and a vector (i.e. for each case two vectors and a scalar) recording the the number of distinct tuples observed, and the index number and count of each observed tuple. A hash is used because the number of distinct tuples possible can be enormous (e.g. 1e12) while the number actually observed will be manageable (e.g. <1e5). The hashed data is transferred into the vector/vector/scalar structure for ease of use. (It may be possible to avoid doing so by using a vector of hash tables (i.e. one per case rather than re-using the same one), and comparing pairs of hash tables in the following step.) This structure then allows a very rapid pairwise comparison: the vectors of tuple indices (which are in sort order) are compared and where common tuples are observed, the corresponding pair of counts are used in calculating the pairwise similarity score, \frac{Sxy}{\sqrt Sxx Syy}. */ #include "stplugin.h" #include /* gets */ #include /* size_t */ #include /* atoi, malloc */ #include /* strcpy */ #include #include "uthash.h" /* Critical maxima 1 longest sequence 2 max distinct tuples allowed per sequence (a harder limit than 1) 3 max number of cases */ #define MAXLENGTH 100 #define MAXTUPLES 40000 #define MAXCASES 5500 int increment_tuple_spell(unsigned long int tuple_id, unsigned long int duration); int nstates; int show_work; struct my_struct { unsigned long int id; /* key */ unsigned long int count; UT_hash_handle hh; /* makes this structure hashable */ }; struct my_struct *tupleshash = NULL; int enumeratetuples_spell (int start, int finish, unsigned long int offset, int *seqs, int *seql, unsigned long int cumdur) { int mti; unsigned long int tuple1, tuple2; char buf[80]; tuple1 = offset*nstates; for (mti=start; mti<=finish; mti++) { if ((seqs[mti]>nstates) | (seqs[mti]<1)) { return(999); } tuple2=tuple1+seqs[mti]; /* state */ if (show_work) { snprintf(buf,80,"Enum tup: %lu %lu %d\n", offset, tuple2, seql[mti]); SF_display(buf); } increment_tuple_spell(tuple2,cumdur+seql[mti]); /* duration */ enumeratetuples_spell(mti+1,finish,tuple2,seqs,seql,cumdur+seql[mti]); } return(0); } int increment_tuple_spell(unsigned long int tuple_id, unsigned long int duration) { struct my_struct *s; char buf[80]; HASH_FIND(hh, tupleshash, &tuple_id, sizeof(unsigned long int), s ); if (s) { s->count+=duration; } else { s = malloc(sizeof(struct my_struct)); if (s == NULL) { snprintf(buf,80,"Hash malloc problem----------------------------------------------------------\n"); SF_error(buf); return(-1); } s->id = tuple_id; s->count = duration; HASH_ADD(hh, tupleshash, id, sizeof(unsigned long int), s ); } return(1); } int id_sort(struct my_struct *a, struct my_struct *b) { return((a->id > b->id ? 1 : (a->id == b->id ? 0 : -1) )); } void sort_by_id() { HASH_SORT(tupleshash, id_sort); } /* /\* Suggested at http://www.gnu.org/software/libc/manual/html_node/Malloc-Examples.html *\/ */ /* void * xmalloc (size_t size) { */ /* register void *value = malloc (size); */ /* if (value == 0) */ /* fatal ("virtual memory exhausted"); */ /* return value; */ /* } */ STDLL stata_call(int argc, char *argv[]) { char buf[80]; int resulttype; int i, j; int nobs,length; double x1; int seqs[MAXLENGTH]; int seql[MAXLENGTH]; long nobstuples; double sxx; ST_retcode rc; double sxy, similarity, distance; unsigned long int k1, k2, maxcases, maxtuples; /* FILE *debugfile; */ /* debugfile = fopen("debugfile","w+"); */ nstates = atoi(argv[1]); if (argc>1) { show_work = atoi(argv[2]); } else { show_work = 0; } if (argc>2) { resulttype = atoi(argv[3]); } else { resulttype = 1; } if (argc>3) { maxcases = atoi(argv[4]) + 1; /* needs to be 1 bigger than _N because SF_vdata does for(i=1;i<=nobs...) */ } else { maxcases = 1000; } if (argc>4) { maxtuples = atoi(argv[5]) + 1; } else { maxtuples = 40000; } double t_sxx[maxcases]; /* double *t_sxx = malloc(maxcases*sizeof(double *)); */ /* move here to see if fixes 2nd run problem Oct 17 2011 23:56:56 */ size_t st = maxcases*sizeof(long *); long int **t_index = malloc(st); long int **t_count = malloc(st); for (i=0; imaxcases) { snprintf(buf, 80, "Error 3: n-cases (%5d) exceeds maximum (%lu)\n", nobs, maxcases); SF_display(buf); return(100); } /* fprintf(debugfile,"entup1\n"); */ /* fflush(debugfile); */ snprintf(buf,80,"Reading data and enumerating tuples\n"); SF_display(buf); for (i=1;i<=nobs;i++) { SF_vdata(2, i, &x1); length = (int)x1; for (j=1; j<=length; j++) { rc = SF_vdata((j-1)*2+3, i, &x1); if (rc) {return(rc);} if (x1<1 || x1>nstates) { snprintf(buf,80,"Error: case %d state out of range: %5.2f\n",j,x1); SF_display(buf); return(1); } seqs[j]=(int)x1; /* state */ rc = SF_vdata((j-1)*2+4, i, &x1); if (x1<1) { snprintf(buf,80,"Error: case %d duration out of range: %5.2f\n",j,x1); SF_display(buf); return(1); } if (rc) {return(rc);} seql[j]=(int)x1; /* duration */ } /* fprintf(debugfile,"entup2\n"); */ /* fflush(debugfile); */ tupleshash = NULL; if (show_work) { snprintf(buf,80,"I: %d\n", i); SF_display(buf); } enumeratetuples_spell(1,length,0,seqs,seql,0); /* Was 1, should start cum duration at 0? Nov 10 2011 21:46:22 */ sort_by_id(); nobstuples = 0; sxx = 0; for(s=tupleshash; s != NULL; s=s->hh.next) { nobstuples++; if (show_work) { snprintf(buf,80,"TPH: %d %lu %lu\n", i, s->id,s->count); SF_display(buf); } t_index[i][nobstuples]=s->id; t_count[i][nobstuples]=s->count; sxx+= (double)s->count*(double)s->count; HASH_DEL(tupleshash, s); free(s); } if (nobstuples>=maxtuples) { snprintf(buf, 80, "Error 2: case%5d: tuples observed %lu\n", i, nobstuples); SF_display(buf); return(100); } t_sxx[i]=sxx; } /* Pairwise processing */ snprintf(buf,80,"Pairwise processing\n"); SF_display(buf); for (i=1; i<=nobs; i++) { switch (resulttype) { case 1: SF_mat_store(argv[0],i,i,1.0); /* Diagonal of 1s for similarity matrix */ break; case 2: SF_mat_store(argv[0],i,i,0.0); /* Diagonal of 0s for distance matrix */ break; case 3: SF_mat_store(argv[0],i,i,(double)t_sxx[i]); /* Diagonal of SXX for SXY matrix */ break; default: snprintf(buf,80,"Error: result type not legitimate\n"); SF_display(buf); return(1); break; } for (j=i+1; j<=nobs; j++) { sxy=0; k1=1; k2=1; while ((t_index[i][k1]>0) & (t_index[j][k2]>0)) { if (t_index[i][k1]==t_index[j][k2]) { sxy = sxy + (double)t_count[i][k1]*(double)t_count[j][k2]; if (show_work) { /* snprintf(buf,80,"STUFF: %d %d %ld %Lf %Lf %Lf\n", */ /* i, j, */ /* t_index[i][k1], */ /* (double)t_count[i][k1], */ /* (double)t_count[j][k2], */ /* (double)sxy ); */ snprintf(buf,80,"STUFF: %d %d %g\n", i,j,sxy ); SF_display(buf); } k1++; k2++; } if (t_index[i][k1]t_index[j][k2]) { k2++; } } similarity = sxy/sqrt((double)t_sxx[i]*(double)t_sxx[j]); if (similarity>1.0) { snprintf(buf,80,"Faulty similarity value: %d %d %5.3f\n", i, j, similarity); SF_display(buf); } distance = sqrt( (double)t_sxx[i] + (double)t_sxx[j] - 2*sxy ) ; if (distance < 0.0) { snprintf(buf,80,"Faulty distance value: %d %d %5.3f\n", i, j, distance); SF_display(buf); } switch (resulttype) { case 1: SF_mat_store(argv[0],i,j,similarity); SF_mat_store(argv[0],j,i,similarity); break; case 2: SF_mat_store(argv[0],i,j,distance); SF_mat_store(argv[0],j,i,distance); break; case 3: SF_mat_store(argv[0],i,j,sxy); SF_mat_store(argv[0],j,i,sxy); break; /* result type checked already */ } } } for (i=0; i < maxcases; i++) { free((void *)t_index[i]); free((void *)t_count[i]); } free((void *)t_index); free((void *)t_count); /* free((void *)t_sxx); */ return(0); }