/* rind.c * This program is copyright 1996 by the Regents of the University of California * All rights reserved. * This version is intended for non-commercial use only. * This program written by Bill Bruno.*/ /* "Release 4" added virtual counts to deal with gamma dependence */ /* "Release 5" applies EM to branch lengths as well as probs */ /* "Release 6" tweeked for quicker convergence and uses 2 treescripts*/ /* "Release 6b" changed treescript choice and fixed 2 sequence bug*/ /* "Release 6c" allow local tree topology changes*/ /* "Release 6.4" make file i/o more robust, other user-friendly features */ /* "Release 6.4.1" fixed problem with names of length 10*/ /* "Release 6.4.2" add -v option*/ /* "Release 6.4.3" changed default gap - from J to X; detect spaces in names; made PSCOUNT work; recommend value .65 */ /* "Release 6.4.4" change criterion for topology change attempt */ /* "Release 6.4.5" allow -P option */ /* "Release 6.5" put branchlengths in natural units; added exponential prior on distances; change default to weighbor, -s for script */ /* "Release 6.9" corrected Selfchange for all-gap positions */ # include # include # include # include #define MaxNoSeq 1001 /* maximum number of sequences CHECK THIS VALUE*/ #define MaxLengthSeq 20001 /* maximum length of alignment CHECK THIS VALUE*/ /* It's ok if these are bigger than you need; but can't have spaces in data, only returns between sequences*/ double Pscount= 0.0; /* pseudo-count used in oldprobupdate*/ /* 0.0 gives maximum likelihood; 1.0 is uniform prior */ /* note: likelihood values ignore the posterior! */ /* .65 worked in Halpern & Bruno, 1998 */ #define FINALTOL .05 /* convergence criterion for counts (main loop)*/ #define COUNTTOL .000000005 /* convergence criterion for counts (modelupdate)*/ #define MAINITMAX 20 /*max iterations in converge (main loop)*/ #define MAXUPIT 100 /*max iterations in modelupdate and improvetree*/ #define MINTREEIMPIT 4 /*min iterations in improvetree before giving up*/ #define TREEMAX 6 /* >= 3*/ /*maximum number of trees saved*/ #define TREEMIN 3 /* don't throw away trees unless this many are left */ /*I recommend 3 or more; smaller values may cause trouble */ #define MINDIST 0.001 /* minimum allowed distance in initial tree */ /* 0 for MINDIST might be dangerous; use something < 1/(20*LengthSeq) */ /* if the tree has a branch length shorter than MINDIST when read from treefile, it is increased to MINDIST */ /* 6c : only branches shorter than MINDIST are candidates for local rearrangements */ #define MaxSlowSeqs 16 /* max seqs for which to use slowtreescript */ #define LTOL .05 /* likelihood convergence criterion for oldprobupdate*/ #define LIKECUTOFF 7.0 /* exp(-LIKECUTOFF) is not going to effect average*/ #define REALIMPROVEMENT 0.5 /* loglike improved enough to try new tree*/ #define BIGNO 1E+37 /* approximate infinity */ /* the following used only by bbisect and findzero */ #define EPS .0000000000000003 /* machine precision (?) */ #define TOL .000001 /* convergence criterion for bbisect*/ #define ITMAX 50 /*max iterations in bbisect*/ #define NAMELENGTH 10 /* number of characters max. in species name */ /*extern char *malloc();*/ void early_exit(); typedef char naym[NAMELENGTH]; typedef struct tstruct { double value; double error; } Double_with_error; typedef struct tdata { /*renormdata:*/ double pcharvect[26]; /*22 of these always needed*/ double *pnvect[26]; /* may only need one of these*/ double data[2*MaxNoSeq]; /* storage for pnvects, doubled for B and Z */ int maxn[26]; /* and one of these*/ double virtcts[26]; /*newly added for virtual counts; (caused by gamma)*/ double virtctsp[26]; /*derivative of virtcts*/ } Renormdata; typedef struct tnode *Treeptr; typedef struct tnode { /* node in tree */ int seqno; /* name of node, if any */ double length; /* length from mother to node */ double changes; /* expected changes along branch for EM */ Treeptr left; /* left daughter address, if any */ Treeptr right; /* right daughter address, if any */ Treeptr mother; /* mother address, if any */ char inc; /* 'y' or 'n' for 'included' */ char mcalc[26]; /* 'y' or 'n' for 'calculated' */ char rcalc[26]; /* 'y' or 'n' for 'calculated' */ char lcalc[26]; /* 'y' or 'n' for 'calculated' */ Renormdata mrdata; Renormdata rrdata; Renormdata lrdata; char side; /* whether node is l or r or m daughter */ } Treenode; char seq[MaxNoSeq][MaxLengthSeq]; /* store the sequences here*/ naym names[MaxNoSeq]; int LengthSeq, NoSeq; /* length and number of sequences read*/ int Iter; /* no of iterations*/ int Treeit; /* which tree we're on */ int NoTrees; /* how many trees right now */ int Debug; int Convgd; /* 1 means on last iteration */ char zerodist; /* whether two seqs have zero distance */ double oldprob[MaxLengthSeq][26], rateequil[MaxLengthSeq],Rateequil,Selfchange,Pscount; double Eff_Length; /* effective length of sequence (ignore invar & all gaps)*/ double Posloglike, Virtcts, Virtctsp; Double_with_error counts[MaxLengthSeq][26]; Double_with_error treecounts[TREEMAX][MaxLengthSeq][26]; Double_with_error probs[MaxLengthSeq][26]; double prevcounts[MaxLengthSeq][26]; /*counts in previous iteration*/ double oldcounts[MaxLengthSeq][26]; double virttreecounts[TREEMAX][MaxLengthSeq]; double virttreecountsp[TREEMAX][MaxLengthSeq]; double virtcounts[MaxLengthSeq]; double virtcountsp[MaxLengthSeq]; double entropy[MaxLengthSeq]; double loglikevect[TREEMAX]; double changenorms[2*MaxNoSeq-2]; Double_with_error answer; int Pos; char Char; int npzeros; /*for finzero*/ int Col; FILE *treefile; int Verbose,UserTree,TreeScript; /* option to specify using fixed tree */ Treeptr treeloc[TREEMAX][MaxNoSeq],Top[TREEMAX]; FILE *Fopen(name,mode) char *name, *mode; { FILE *tmp; tmp = fopen(name,mode); if(tmp == NULL) { fprintf(stderr, "could not open file "); perror(name); fprintf(stderr, "exiting \n"); early_exit(132); } return(tmp); } int comparechars(char1,char2) char char1,char2; { if(char1==char2) return(1); if(char1==' ' && char2=='_') return(1); if(char1=='_' && char2==' ') return(1); if('a'<= char1 && char1 <= 'z' && 'A'<=char2 && char2 <='Z' && (char1-char2) == 'a'-'A') return(1); if('a'<= char2 && char2 <= 'z' && 'A'<=char1 && char1 <='Z' && (char2-char1) == 'a'-'A') return(1); return(0); } int comparenames(name1,name2) char *name1, *name2; { int i1, i2; i1 = 0; i2 = 0; while((name1[i1] == ' ' || name1[i1]=='_') && i1 < NAMELENGTH-1) i1 ++; while((name2[i2] == ' ' || name2[i2]=='_') && i2 < NAMELENGTH-1) i2 ++; while(comparechars(name1[i1],name2[i2])==1 && i1 < NAMELENGTH-1 && i2 < NAMELENGTH-1 && name1[i1] != '\0' && name2[i2] != '\0') { i1 ++; i2 ++; } if(name1[i1] == '\0' || (i1 == NAMELENGTH-1 && (name1[i1] == ' ' || name1[i1] == '_'))) { while(i2 < NAMELENGTH-1 && (name2[i2]== ' ' || name2[i2]== '_')) i2 ++; if((i2 == NAMELENGTH-1 && (name2[i2]== ' ' || name2[i2] == '_' )) || name2[i2]== '\0') return(1); } if(name2[i2] == '\0' || (i2 == NAMELENGTH-1 && (name2[i2] == ' ' || name2[i2] == '_'))) { while(i1 < NAMELENGTH-1 && (name1[i1]== ' ' || name1[i1]== '_')) i1 ++; if((i1 == NAMELENGTH-1 && (name1[i1]== ' ' || name1[i1] == '_' )) || name1[i1]== '\0') return(1); } if((i1 == NAMELENGTH-1 || i2 == NAMELENGTH-1 ) && comparechars(name1[i1],name2[i2])==1) { return(1); } else { return(0); } } void readdata() { FILE *fp; char tchar; int tpos,tseq, scanret, error=0,j, ttseq, whitespace, whitespaceerror; int seqlen[MaxNoSeq], startpos; /* read sequences with no white space except carriage returns after each sequence */ /* NAMELENGTH character name followed by white space, then sequences */ if((fp=Fopen("data","r"))==NULL) { printf("could not open file 'data'\n"); error = 1; } for(tseq=0;tseq< MaxNoSeq;tseq++) seqlen[tseq]=0; LengthSeq=0; tseq=-1; scanret = 0; tchar = 'e'; while(scanret != EOF && tseq < MaxNoSeq) { tseq++; whitespace=0; whitespaceerror=0; for (j = 0; j < NAMELENGTH && scanret != EOF; j++) { tchar = getc(fp); if(j==0) { while(tchar == '\n') tchar = getc(fp); } if(whitespace==1 && !(tchar == ' ' || tchar == '\t' || tchar=='\n') ) { whitespaceerror=1; error=1; } if(tchar == ' ' || tchar == '\t' || tchar=='\n') whitespace=1; if(tchar != EOF) names[tseq][j] = tchar; else scanret = tchar; } tpos = -1; if(whitespaceerror==1) { fprintf(stderr,"Bad Name: contains internal whitespace\n"); fprintf(stderr,"sequence %d, %.10s\n",tseq,names[tseq]); whitespaceerror=0; } if(scanret != EOF) { ttseq = 0; while(ttseq< tseq &&comparenames(names[tseq],names[ttseq])==0) ttseq++; if(ttseq < tseq && ttseq > 0 && (seqlen[ttseq-1] <= seqlen[ttseq])) { printf("bad data file: either inadvertent name repeat in lines %d and %d\n",tseq+1,ttseq+1); printf("or change in order of sequences in different blocks\n"); early_exit(258); } tseq=ttseq; tpos = seqlen[tseq] -1; startpos = tpos+1; } if(Verbose>=1) printf("reading seq %d from pos %d\n",tseq,startpos); while(tpos25) || (tchar < 'A')) { if((tchar-'A' >25) || (tchar < 'A')) { printf("illegal character! %c should be A-Z\n",tchar); printf("tpos= %d, tseq=%d\n",tpos,tseq); error = 1; } } } /* if(tchar=='B' || tchar=='Z') { printf("illegal B or Z! %c not allowed\n",tchar); error = 1; } */ /* B and Z have special meanings */ /* B here means N or D; Z here means E or Q. However, in the probability vectors I store in B the probability of broken chain of inheritance, and in Z the total probability of all characters */ if(tchar=='O'|| tchar=='U') { printf("illegal O or U! %c never allowed\n",tchar); error = 1; } /* J now means a gap */ if((tpos == MaxLengthSeq) && (tchar != '\n')) { printf("sequence too long! MaxLengthSeq=%d\n", MaxLengthSeq); printf("tseq= %d\n",tseq); while(tchar != '\n' && scanret!=EOF) { scanret = fscanf(fp,"%c",&tchar); tpos++; } printf("this seq is %d positions long\n",tpos); error = 1; } if(tchar != '\n' && scanret != EOF) { seq[tseq][tpos]=tchar; seqlen[tseq]++; } } /* end while */ if(LengthSeq == 0) LengthSeq = tpos; if(tpos > LengthSeq && tseq==NoSeq) { fprintf(stderr,"warning: seq's not all same length\n"); NoSeq++; } if(tpos < LengthSeq && tpos > 0) { fprintf(stderr,"warning: seq's not all same length\n"); if(tseq==NoSeq) NoSeq ++; } if(tpos == LengthSeq) NoSeq ++; if((tseq == MaxNoSeq) && (scanret!= EOF)) { printf("too many sequences! MaxNoSeq= %d\n",MaxNoSeq); error = 1; } if(tchar == '\n') tchar = 'r'; } LengthSeq=seqlen[0]; for(tseq=0;tseqLengthSeq) LengthSeq=seqlen[tseq]; } for(tseq=0;tseq 1.0) { if(prob > 1.000001) { printf("probcheck: at %d prob= %f > 1\n",loc,prob); err = 1; } retval = 1.0; } if(prob < 0.0) { printf("probcheck: at %d prob= %f < 0\n",loc,prob); err = 1; retval = 0.0; } if(prob <= 1.000001 && prob >= 0.0 && err == 0) return retval; else { printf("probcheck: at %d prob = %f\n",loc,prob); err = 1; early_exit(425); exit(1); } } double logck(arg,loc) double arg; /*check arg is in bounds (allow some error) */ int loc; { double retval; int err=0; retval = arg; if(arg < 0.0) { printf("logcheck: error at %d arg= %f < 0\n",loc,arg); err = 1; } if(arg == 0.0) { printf("logcheck: warning at %d arg = 0\n",loc); retval = 1.0e-100; } if(arg >= 0.0 && err == 0) return retval; else { printf("logcheck: at %d arg = %f\n",loc,arg); early_exit(454); exit(1); } } double nonnegck(num,loc) /* check nonnegative */ double num; { if(num>=0.0) return(num); else { printf("%g should be >= 0! at %d\n",num,loc); early_exit(466); exit(1); } } double numck(x, loc) double x; int loc; { if(x>= 0.0||x< 0.0) return(x); else { printf("non-number at loc %d!\n", loc); return(x); } } double square(x) double x; { return(x*x); } void prevcountsinit() { int pos1, char1; for(pos1=0;pos1 1.0) { norm += rateequil[tpos]; pisq+= rateequil[tpos]*oldprob[tpos][0]*oldprob[tpos][0]; for(tchar=2;tchar<26;tchar++) /*skip 1; it equals 1.0 (not an a.a.)*/ { pisq+= rateequil[tpos]*oldprob[tpos][tchar]*oldprob[tpos][tchar]; } } } Selfchange = pisq/(Rateequil*Eff_Length); } void rateequilinit() { double maxp; int tpos, tchar; Rateequil=0.0; Eff_Length = 0.0; for(tpos=0;tposmaxp) { maxp = oldprob[tpos][tchar]; } } if(pck(maxp,6602) = 0.0 && tprobm <= 1.0) { tprobm = tprobm; } else { tprobm = 1.0; } tprob = tprobm*p; if(tchar==seq[seq2][tpos]) { sumval += (1.-pi)*gamma*tprobm/(tprob+(1-tprob)*pi); } else if((tchar == 'B' && (tchar2 == 'D' || tchar2 == 'N')) || (tchar == 'Z' && (tchar2 == 'E' || tchar2 == 'Q')) ) sumval += (1.-pi)*gamma*tprobm/(tprob+(1-tprob)*pi); else if((tchar2 == 'B' && (tchar == 'D' || tchar == 'N')) || (tchar2 == 'Z' && (tchar == 'E' || tchar == 'Q')) ) { if(tchar2 == 'B') /* pi = prob of char at saturation */ pi = oldprob[tpos]['N' - 'A'] + oldprob[tpos]['D' - 'A']; if(tchar2 == 'Z') pi = oldprob[tpos]['Q' - 'A'] + oldprob[tpos]['E' - 'A']; sumval += (1.-pi)*gamma*tprobm/(tprob+(1-tprob)*pi); } else /* sequences disagree */ { sumval -= gamma*tprobm/(1.-tprob); } } } return(sumval + 1/p); /*multiplying by p before differentiating the log is like using an exponential prior on d */ } double d2like(p,seq1,seq2) /* second derivaative */ double p; int seq1,seq2; { int tpos,tchar, tchar2; double sumval=0.0,tprob, tprobm,tprobm2,gamma, pi; for(tpos=0;tpos= 0.0) return(cx2); while(i TOL){ xnew = (cx1+cx2)*0.5; fnew = func(xnew,seq1,seq2); if(fnew==0.0) return(xnew); else if(fnew > 0.0) cx1 = xnew; else cx2= xnew; i++; } if(i==ITMAX) printf("too many its?: %d",i); return(0.5*(cx1+cx2)); } double findzero(tol,seq1,seq2,weightptr) double tol, *weightptr; int seq1,seq2; { double p=EPS, tlike, cross=0.0, ncross= EPS; tlike = dlike(p, seq1, seq2); while((p < .98) && tlike <= 0.0 && cross == 0.0) { p+= .02; tlike = dlike(p, seq1, seq2); if( -1.0 > tlike*p) { cross= p; } else ncross = p; } if(cross == 0.0) { p = bbisect(p,1.0-.5*tol,tol,seq1,seq2,dlike); /* printf("max like = %f\n",p); */ /* *weightptr = -d2like(p,seq1,seq2); removed for speed */ return(p); } else { /* infinite m.l. distance; must make finite estimate */ /* printf("pmood(ncross) = %lg pmood(cross)= %lg cross=%f\n", pmoodlike(ncross,seq1,seq2),pmoodlike(cross,seq1,seq2),cross); */ p = bbisect(ncross,cross,tol,seq1,seq2,pmoodlike); /* printf("max like = 0; using %lg\n",0.5*p); */ npzeros++; *weightptr = 4./(p*p); /* return(0.5*p); */ /* try something more "rigorous" */ /* if we integrate sqrt(2/pi*s^2) -log(p) e^(-p^2/2s^2) dp from 0 to infinity, (s is sigma, = root of pmoodlike above) and take exp(- result) we get s e^(-gamma/2)/sqrt(2) = .5298393548 s */ return(0.5298393548*p); } } void early_exit(loc) int loc; { printf("program NOT converged, exiting prematurely from line %d!\n",loc); fflush(stdout); fflush(stderr); exit(1); /* savecounts(); calcprobs(); printprobs(); printmodel(); calcentropy(); printent(); if(system("mv besttree oldbesttree")!= 0) printf("could not mv besttree!\n"); printbesttree(); exit(1); */ } int writedistances() /* calculate and write distances*/ { int seq1, seq2, j; double p,d,weight,dist[MaxNoSeq][MaxNoSeq]; FILE *fp; /* FILE *fpe; int k; */ zerodist = 0; if((fp=Fopen("infile","w")) == NULL) { printf("can't open infile for writing"); return(1); } /* weightfile removed for speed if((fpe=Fopen("weightfile","w")) == NULL) { printf("can't open weightfile for writing"); return(1); } */ fprintf(fp," %d\n",NoSeq); npzeros = 0; for(seq1=0;seq1=seq1) { p=pck(findzero(.0000001,seq1,seq2,&weight),5); d=log(logck(1.0/p,5)); dist[seq1][seq2] = d; if(d==0.0 && seq2>seq1) zerodist = 1; } else d= dist[seq2][seq1]; fprintf(fp,"%f ",d*Rateequil*(1.0-Selfchange)); /* fprintf(fpe,"%f ",weight*p*p); removed for speed*/ } fprintf(fp,"\n"); } if(npzeros > 0) printf("%d prs w/ inf. dist,",npzeros); fclose(fp); /*fclose(fpe);*/ return(0); } /* -------- ReadTree ------------------------ * */ Treeptr allocatetree() { Treenode *top; if(NoSeq > 3) { if((top = (Treenode *) malloc((2*NoSeq-2)*sizeof(Treenode)))==NULL) { printf("allocatetree: can't allocate tree memory for tree %d\n", NoTrees+1); printf("will use %d trees\n",NoTrees); } } else { if((top = (Treenode *) malloc((2*3-2)*sizeof(Treenode)))==NULL) { printf("allocatetree: can't allocate tree memory for tree %d\n", NoTrees+1); printf("will use %d trees\n",NoTrees); } } return(top); } int ReadTree( top, in_fp ) Treeptr top; FILE *in_fp; { Treenode *here; char a, punct[2],name[NAMELENGTH+1]; int seqno, count, memcount, numneg=0, j, tseqno, found[MaxNoSeq]; double worstneg=0.0; for(j=0;j= NoSeq) (*treeloc[Treeit][seqno]).inc = 'n'; if (punct[0] == ',') { here = (*(*here).mother).right; break; } else { if (punct[0] == ':') { if (fscanf(in_fp,"%lf", &((*here).length)) != 1) top = NULL; (*here).length /= (Rateequil*(1.0-Selfchange)); if((*here).length < MINDIST) { if((*here).length < worstneg) worstneg = (*here).length; (*here).length = MINDIST; numneg ++; } break; } else { printf("after name=%s:\n",name); printf("problem with punctuation--punct[0]=%c:\n",punct[0]); printf("treefile ends prematurely?"); top = NULL; break; } } } } if(NoSeq >= 3 && memcount != 2*NoSeq-2) { printf("did not fill tree!"); printf("seqno = %d, memcount = %d",seqno,memcount); top=NULL; } if(NoSeq < 3 && memcount != 4) { printf("did not fill tree!"); printf("seqno = %d, memcount = %d",seqno,memcount); top=NULL; } if (top != here) { printf("error: tree did not parse\n"); if(top == NULL) printf("top == Null\n"); printf("top = %d, here = %d\n",(*top).seqno,(*here).seqno); top = NULL; } else if((*here).side != 'm') { printf("error: tree did not parse--should be unrooted\n"); } (*here).length = (*(*here).mother).length; if(numneg > 0) printf(" %d lengths<%g, worst: %6g,",numneg, MINDIST,worstneg); for(j=0;j 2*NoSeq) { printf("vrenorm: index out of bounds\n"); printf("maxn=%d, NoSeq=%d",(*ndata).maxn['Z'-'A'],NoSeq); early_exit(1319); } (*ndata).maxn[Char-'A'] = maxn; if(maxn1>0) pnvect1 = ((*ndata1).pnvect)[Char-'A']; if(maxn2>0) pnvect2 = ((*ndata2).pnvect)[Char-'A']; if(maxn>0) pnvectret = (*ndata).data + (*ndata).maxn['Z'-'A']; if(maxn>0) ((*ndata).pnvect)[Char-'A']=pnvectret; gamma = rateequil[Pos]; /* rate of equilibration*/ p1 = pck(exp(-gamma*length1),-1); p2 = pck(exp(-gamma*length2),-2); omp1 = -expm1(-gamma*length1); /* 1-p1 */ omp2 = -expm1(-gamma*length2); /* 1-p2 */ if(omp1 > 0) { negvirtct1 = -gamma*length1*p1/omp1; negvirtctp1 = (gamma*length1*p1*(omp1-gamma*length1)/omp1)/omp1; } else { negvirtct1 = - 1.0; negvirtctp1 = - 0.0; } if(omp2 > 0) { negvirtct2 = -gamma*length2*p2/omp2; negvirtctp2 = (gamma*length2*p2*(omp2-gamma*length2)/omp2)/omp2; } else { negvirtct2 = - 1.0; negvirtctp2 = - 0.0; } /* these have been normalized to 1.0 so we will get rid of them : sum1 = 1.0; sum2 = 1.0; */ if(calc == 'n') /*not yet calculated pcharvect*/ { for(char1=0;char1<25;char1++) { tmpa = 0.0; tmpb = 0.0; if(char1==1) /* this is where we keep uninherited characters*/ { tmpa = (omp1)*(omp2); tmpb = -2.0*p1*p2*pcharvect1[1]*pcharvect2[1]; } if(pcharvect1[char1] > 0.0 || pcharvect2[char1] > 0.0 || tmpa>0.0) { pcharvectret[char1] = nonnegck(tmpa + tmpb + p1*pcharvect1[char1]*p2*pcharvect2[char1]/oldprob[Pos][char1] + p1*pcharvect1[char1]*(omp2+p2*pcharvect2[1]) + p2*pcharvect2[char1]*(omp1+p1*pcharvect1[1]),101) ; virtctsret[char1] = tmpa* (virtcts1[25]+virtcts2[25]+negvirtct1+negvirtct2) + tmpb* (virtcts1[1]+virtcts2[1]+gamma*(length1+length2)) + p1*pcharvect1[char1]*p2*pcharvect2[char1]/oldprob[Pos][char1]* (virtcts1[char1]+virtcts2[char1]+gamma*(length1+length2)) + p1*pcharvect1[char1]*(omp2* (virtcts1[char1]+virtcts2[25]+gamma*length1+negvirtct2)+ p2*pcharvect2[1]* (virtcts1[char1]+virtcts2[1] +gamma*(length1+length2)) ) + p2*pcharvect2[char1]*(omp1* (virtcts2[char1]+virtcts1[25]+gamma*length2+negvirtct1)+ p1*pcharvect1[1]* (virtcts2[char1]+virtcts1[1] +gamma*(length1+length2)) ) ; virtctspret[char1] = tmpa* (square(virtcts1[25]+virtcts2[25]+negvirtct1+negvirtct2) + (virtctsp1[25]+virtctsp2[25]+negvirtctp1+negvirtctp2)) + tmpb* (square(virtcts1[1]+virtcts2[1]+gamma*(length1+length2)) + (virtctsp1[1]+virtctsp2[1]-gamma*(length1+length2))) + p1*pcharvect1[char1]*p2*pcharvect2[char1]/oldprob[Pos][char1]* (square(virtcts1[char1]+virtcts2[char1]+gamma*(length1+length2)) + (virtctsp1[char1]+virtctsp2[char1]-gamma*(length1+length2))) + p1*pcharvect1[char1]*omp2* (square(virtcts1[char1]+virtcts2[25]+gamma*length1+negvirtct2) + (virtctsp1[char1]+virtctsp2[25]-gamma*length1+negvirtctp2)) + p1*pcharvect1[char1]*p2*pcharvect2[1]* (square(virtcts1[char1]+virtcts2[1] +gamma*(length1+length2)) + (virtctsp1[char1]+virtctsp2[1]-gamma*(length1+length2))) + p2*pcharvect2[char1]*omp1* (square(virtcts2[char1]+virtcts1[25]+gamma*length2+negvirtct1)+ (virtctsp2[char1]+virtctsp1[25]-gamma*length2+negvirtctp1)) + p2*pcharvect2[char1]*p1*pcharvect1[1]* (square(virtcts2[char1]+virtcts1[1] +gamma*(length1+length2))+ (virtctsp2[char1]+virtctsp1[1]-gamma*(length1+length2))); } else { pcharvectret[char1] = 0.0; virtctsret[char1] = 0.0; virtctspret[char1] = 0.0; } } sum = 0.0; virtctsret[25] = 0.0; virtctspret[25] = 0.0; for(char1=0;char1<25;char1++) { sum += pcharvectret[char1]; virtctsret[25] += virtctsret[char1]; virtctspret[25] += virtctspret[char1]; } if(sum ==0.0) { retval=-1; printf("vrenorm: impossible tree!\n"); } for(char1=0;char1<25;char1++) { if(pcharvectret[char1] > 0.0) { virtctsret[char1] /= pcharvectret[char1]; virtctspret[char1] /= pcharvectret[char1]; virtctspret[char1] -= square(virtctsret[char1]); } else { virtctsret[char1] = 0.0; virtctspret[char1] = 0.0; } pcharvectret[char1] /= sum; } pcharvectret[25] = log(logck(sum,251))+pcharvect1[25]+pcharvect2[25]; virtctsret[25] /= sum; virtctspret[25] /= sum; virtctspret[25] -= square(virtctsret[25]); if(pcharvectret[25] > 0.000001) printf("warning: likelihood > 1 (log=%g)\n", pcharvectret[25]); } for(i=0;i 0 && tmp > 0.0) { tmp = pcharvectret[Char-'A']/tmp; for(i=0;i 2*NoSeq) { printf("srenorm: index out of bounds\n"); early_exit(1544); } (*ndata).maxn[Char-'A'] = maxn; if(maxn1>0 && !(((*ndata1).pnvect[Char-'A'] < (*ndata1).data+2*NoSeq) && ((*ndata1).pnvect[Char-'A'] >= (*ndata1).data))) { fprintf(stderr,"srenorm error:\n"); fprintf(stderr,"problem passing pointer, or pnvect out of bounds!\n"); printf("Char = %c\n",Char); early_exit(1554); } if(maxn1>0) pnvect1 = (*ndata1).pnvect[Char-'A']; gamma = rateequil[Pos]; /* rate of equilibration= # of codons*/ p1 = pck(exp(-gamma*length1),-1); omp1 = -expm1(-gamma*length1); /* 1-p1 */ if(gamma*length1 > 0) { negvirtct1 = -gamma*length1*p1/omp1; negvirtctp1 = gamma*length1*p1*(omp1-gamma*length1)/(omp1*omp1); } else { negvirtct1 = - 1.0; negvirtctp1 = - 0.0; } sum1 = 1.0; if(calc == 'n') /*not yet calculated pcharvect*/ { for(char1='A';char1<='Z';char1++) { pcharvectret[char1-'A'] = p1*pcharvect1[char1-'A']; virtctsret[char1-'A'] = (virtcts1[char1-'A']+gamma*length1); virtctspret[char1-'A'] = (virtctsp1[char1-'A']-gamma*length1);/*squares cancel*/ } pcharvectret[1] += (omp1)*sum1; /* this is where we keep uninherited ch*/ virtctsret[1] *= p1*pcharvect1[char1-'A']; virtctsret[1] += omp1*sum1*(virtcts1[25]+negvirtct1); if(pcharvect1[1] > 0.0) virtctsret[1] /= pcharvect1[char1-'A']; virtctspret[1] *= p1*pcharvect1[char1-'A']; virtctspret[1] += omp1*sum1* (square(virtcts1[25]+negvirtct1) + (virtctsp1[25]+negvirtctp1)); if(pcharvect1[1] > 0.0) virtctspret[1] /= pcharvectret[1]; virtctspret[1] -= square(virtctsret[1]); pcharvectret[25] = pcharvect1[25]; virtctsret[25] = virtcts1[25]; virtctspret[25] = virtctsp1[25]; } for(i=0;i .00001) { printf("error in srenorm\n"); printf("tmp= %g pcharvret = %g\n",tmp, pcharvectret[Char-'A']); printf("Pos= %d, Char= %c, maxn= %d\n",Pos,Char,maxn); early_exit(1618); } return(retval); } void updateleaf(nodeptr) Treeptr nodeptr; { int i, seqno, *maxn; double *pcharvectret, sum=0.0; Renormdata *ndata; ndata = &((*nodeptr).mrdata); seqno=(*nodeptr).seqno; pcharvectret = (*ndata).pcharvect; maxn = (*ndata).maxn; if((*nodeptr).mcalc['Z'-'A']!='y') { (*nodeptr).mcalc['Z'-'A'] = 'y'; for(i=0;i<25;i++) { pcharvectret[i] = 0.0; (*ndata).virtcts[i] = 0.0; (*ndata).virtctsp[i] = 0.0; } (*ndata).virtcts[25] =0.0; (*ndata).virtctsp[25] =0.0; if((*nodeptr).inc == 'y') { if(seq[seqno][Pos] != 'B' && seq[seqno][Pos] != 'Z') { if(oldprob[Pos][seq[seqno][Pos]-'A']==0.0) printf("warning: zero prob (char = %c)\n",seq[seqno][Pos]); pcharvectret[25] = log(logck(pck(oldprob[Pos][seq[seqno][Pos]-'A'],25),25)); pcharvectret[seq[seqno][Pos]-'A'] = 1.0; } if(seq[seqno][Pos] == 'B') { sum = pck(oldprob[Pos]['N'-'A'] + oldprob[Pos]['D'-'A'],26); pcharvectret['N'-'A'] = oldprob[Pos]['N'-'A']/sum; pcharvectret['D'-'A'] = oldprob[Pos]['D'-'A']/sum; pcharvectret[25] = log(logck(sum,265)); } if(seq[seqno][Pos] == 'Z') { sum = pck(oldprob[Pos]['Q'-'A'] + oldprob[Pos]['E'-'A'],27); pcharvectret['Q'-'A'] = oldprob[Pos]['Q'-'A']/sum; pcharvectret['E'-'A'] = oldprob[Pos]['E'-'A']/sum; pcharvectret[25] = log(logck(sum,275)); } } else /* 'x' character or some such uninformative case */ { pcharvectret[1] = 1.0; pcharvectret[25] = 0.0; } } if((*nodeptr).mcalc[Char-'A']!='y'&& (*nodeptr).inc=='y') { (*nodeptr).mcalc[Char-'A']='y'; if(pcharvectret[Char-'A'] > 0.0) { (*ndata).pnvect[Char-'A'] = (*ndata).data + maxn['Z'-'A']; maxn['Z'-'A'] += 1; maxn[Char-'A'] = 1; (*ndata).pnvect[Char-'A'][0]=pcharvectret[Char-'A']; } else maxn[Char-'A'] = 0; } if((*nodeptr).inc == 'y' && pcharvectret[Char-'A'] > 0.0 && (!(((*ndata).pnvect[Char-'A'] < (*ndata).data+2*NoSeq) && ((*ndata).pnvect[Char-'A'] >= (*ndata).data))) ) { fprintf(stderr, "updateleaf: problem passing pointer, or pnvect out of bounds!\n"); /* fprintf(stderr, "Pos= %d, Char = %c, maxn= %d, pnvect[char]= %g data = %p\n", Pos, Char, maxn['Z'-'A'], *(ndata->pnvect+(Char-'A')), ndata->data); */ fprintf(stderr, "maxn[char]= %d, pcharvectret[char] = %g, seqno=%d\n", maxn[Char-'A'],pcharvectret[Char-'A'],seqno); early_exit(1714); } } Renormdata *finddata(nodeptr,dir) Treeptr nodeptr; char dir; { switch (dir) { case 'm': return(&((*nodeptr).mrdata)); break; case 'l': return(&((*nodeptr).lrdata)); break; case 'r': return(&((*nodeptr).rrdata)); break; } printf("unknown direction!\n"); exit(1); } int nodeupdate(nodeptr,dir) Treeptr nodeptr; char dir; { double length1, length2; int seqno, retval; Treeptr nodeptr1, nodeptr2; char dir1, dir2, calc; Renormdata *ndata, *ndata1, *ndata2; double p1,p2,gamma; retval = 0; if(nodeptr == NULL) { printf("at empty node!"); return(1); } /*printf("at node: %d, dir= %c inc= %c \n",(*nodeptr).seqno,dir,(*nodeptr).inc, *maxnptr); */ seqno = (*nodeptr).seqno; if(seqno >= 0) { /* at a leaf */ if(dir!='m') { printf("error:leaves have no children!"); return(-1); } updateleaf(nodeptr); } else { switch (dir) { case 'm': if((*nodeptr).mcalc[Char-'A'] == 'y') { return(retval); } else { (*nodeptr).mcalc[Char-'A'] = 'y'; calc = (*nodeptr).mcalc[25]; (*nodeptr).mcalc[25] = 'y'; nodeptr1 = (*nodeptr).left; dir1 = 'm'; nodeptr2 = (*nodeptr).right; dir2 = 'm'; } break; case 'l': if((*nodeptr).lcalc[Char-'A'] == 'y') { return(retval); } else { (*nodeptr).lcalc[Char-'A'] = 'y'; calc = (*nodeptr).lcalc[25]; (*nodeptr).lcalc[25] = 'y'; nodeptr1 = (*nodeptr).mother; dir1 = (*nodeptr).side; nodeptr2 = (*nodeptr).right; dir2 = 'm'; } break; case 'r': if((*nodeptr).rcalc[Char-'A'] == 'y') { return(retval); } else { (*nodeptr).rcalc[Char-'A'] = 'y'; calc = (*nodeptr).rcalc[25]; (*nodeptr).rcalc[25] = 'y'; nodeptr1 = (*nodeptr).mother; dir1 = (*nodeptr).side; nodeptr2 = (*nodeptr).left; dir2 = 'm'; } break; default: printf("nrenorm: unknown side-type"); break; } if((ndata = finddata(nodeptr,dir))==NULL) printf("nodeupdate: bad dir"); if((ndata1 = finddata(nodeptr1,dir1))==NULL) { printf("nodeupdate: bad dir1"); early_exit(1833); } if((ndata2 = finddata(nodeptr2,dir2))==NULL) printf("nodeupdate: bad dir2"); if(nodeupdate(nodeptr1,dir1)<0) { printf("bad nodeupdate1"); retval = -1; } if(nodeupdate(nodeptr2,dir2)<0) { printf("bad nodeupdate2"); retval = -1; } if(nodeptr1 == (*nodeptr).mother) { length1 = (*nodeptr).length; } else { length1 = (*nodeptr1).length; } if(nodeptr2 == (*nodeptr).mother) { length2 = (*nodeptr).length; } else { length2 = (*nodeptr2).length; } if(vrenorm(length1, length2,ndata1,ndata2,ndata,calc) < 0) { gamma = rateequil[Pos]; p1 = pck(exp(-gamma*length1),-1); p2 = pck(exp(-gamma*length2),-2); printf("impossible tree! nodes %d, %d, %d\n", (*nodeptr).seqno,(*nodeptr1).seqno,(*nodeptr2).seqno); printf("length1=%g, length2=%g, Pos= %d\n",length1,length2,Pos); printf("p1= %g, p2= %g gamma= %g\n",p1,p2,gamma); early_exit(1879); } (*nodeptr).inc = 'y'; } if(retval==0 && (*nodeptr).inc != 'y') retval=1; return(retval); } Double_with_error *porig(nodeptr,dir) Treeptr nodeptr; char dir; { Treeptr nodeptr1; double p1, omp1, *pcharvect1, *pnvect1, length1, sum1, tmp, errsum, anssum,gamma; double *virtcts1; double *virtctsp1; double negvirtct1, negvirtctp1; char dir1; int maxn1, i; Renormdata *ndata1; switch (dir) { case 'm': nodeptr1 = (*nodeptr).mother; dir1 = (*nodeptr).side; break; /* case 'l': nodeptr1 = (*nodeptr).left; dir1 = 'm'; break; case 'r': nodeptr1 = (*nodeptr).right; dir1 = 'm'; break; these don't apply*/ default: printf("porig: unknown side-type"); break; } if(nodeptr1 == (*nodeptr).mother) length1 = (*nodeptr).length; else length1 = (*nodeptr1).length; /*printf("start at %d %c\n",(*nodeptr1).seqno,dir1);*/ gamma = rateequil[Pos]; /* rate of equilibration*/ if(nodeupdate(nodeptr1,dir1)==0) { ndata1 = finddata(nodeptr1,dir1); pcharvect1 = (*ndata1).pcharvect; maxn1 = (*ndata1).maxn[Char-'A']; if(maxn1>0) pnvect1 = (*ndata1).pnvect[Char-'A']; virtcts1 = (*ndata1).virtcts; virtctsp1 = (*ndata1).virtctsp; p1 = pck(exp(-gamma*length1),77); omp1 = -expm1(-gamma*length1); /* 1-p1 */ if(gamma*length1 > 0) { negvirtct1 = -gamma*length1*p1/omp1; negvirtctp1 = gamma*length1*p1*(omp1-gamma*length1)/(omp1*omp1); } else { negvirtct1 = - 1.0; negvirtctp1 = - 0.0; } /* sum1 is not a good name, but: */ sum1 = (omp1)*oldprob[Pos][Char-'A'] /* pcharvect1[] sum to 1.0 */ + pcharvect1[1]*p1*oldprob[Pos][Char-'A']; anssum = pck(sum1,77); /* character unshared by inheritance*/ Virtcts = omp1* (virtcts1[25]+negvirtct1) + p1*pcharvect1[1]* (virtcts1[1]+gamma*length1) + (p1*pcharvect1[Char-'A']/oldprob[Pos][Char-'A'])* (virtcts1[Char-'A']+gamma*length1); Virtcts /= (omp1+p1*pcharvect1[1]+p1*pcharvect1[Char-'A']/ oldprob[Pos][Char-'A']); Virtctsp = omp1* (square(virtcts1[25]+negvirtct1) + virtctsp1[25]+negvirtctp1) + p1*pcharvect1[1]* (square(virtcts1[1]+gamma*length1)+ (virtctsp1[1]-gamma*length1)) + (p1*pcharvect1[Char-'A']/oldprob[Pos][Char-'A'])* (square(virtcts1[Char-'A']+gamma*length1)+ (virtctsp1[Char-'A']-gamma*length1)); Virtctsp /= (omp1+p1*pcharvect1[1]+p1*pcharvect1[Char-'A']/ oldprob[Pos][Char-'A']); Virtctsp -= square(Virtcts); sum1 += pck(pcharvect1[Char-'A']*p1,44); Posloglike = log(logck(pck(sum1,66),66)) + pcharvect1[25]; if(Posloglike > 0.0000001) { printf("error: likelihood greater than 1! = %g\n",Posloglike); } if(!(Virtctsp <= 0.0 || Virtctsp > 0.0)) { printf("ill-defined Virtctsp = %g\n",Virtctsp); printf("Virtcts = %g, p1= %g omp1= %g length1=%g\n", Virtcts, p1, omp1, length1); printf("virtctsp1[25]=%g,pcharvect1[1]=%g", virtctsp1[25], pcharvect1[1]); } if(sum1 == 0.0) { printf("zero likelihood tree, sum1 = 0.0\n"); printf("Pos = %d, Char = %c, p1 = %f, oldprob = %f, pchar[25] = %f\n", Pos, Char, p1, oldprob[Pos][Char-'A'],pcharvect1[25]); } if(p1==1.0 && sum1 == 0.0) { printf("at node %d:",(*nodeptr).seqno); printf(" zero likelihood at pos=%d\n",Pos); } anssum = pck(anssum/sum1,55); /* normalize */ errsum = anssum*(1.0-anssum); for(i=0;i=0.0)) printf("negative? %g ans.val = %g sum1= %g errsum = %g maxn1=%d\n", errsum, answer.value, sum1, errsum,maxn1); answer.error = nonnegck(errsum); return(&answer); } else { /* printf("no active sequences\n");*/ answer.value = 1.0; answer.error = 0.0; return(&answer); } } void cleartreedata() { int i,j,maxi; if(NoSeq >=3) maxi = 2*NoSeq-2; else maxi = 4; for(i=0;i=3) maxi = 2*NoSeq-2; else maxi = 4; for(i=0;i=3) maxi = 2*NoSeq-2; else maxi = 4; for(i=0;i 0.0 ) { if(oldprob[Pos][j]==0.0) { printf("zero prob char at 760: pos %d char %c pchr %g\n", Pos,'A'+j,pcharvect1[j]); early_exit(2154); } pnochange += nonnegck(pcharvect1[j]*pcharvect2[j]/oldprob[Pos][j],760); } } pchange = pck((1.0 - pcharvect1[1])*(1.0-pcharvect2[1]),761); pdontcare = pck(pcharvect1[1]+pcharvect2[1] - pcharvect1[1]*pcharvect2[1],762); tmp = pchange*(-expm1(-gamma*length))+ pnochange*exp(-gamma*length); if(tmp + pdontcare <=0.0) { fprintf(stderr,"impossible tree!? p= %g\n", pchange*(-expm1(-gamma*length))+ pnochange*exp(-gamma*length)); printf("pchange=%g nochange=%g gamma*length= %g\n", pchange, pnochange, gamma*length); printf("-expm1(-gamm*l)= %g ,exp(-gam*l)= %g\n", -expm1(-gamma*length), exp(-gamma*length)); printf("p dont care = %g",pdontcare); fflush(stdout); early_exit(2175); } /* following is true EM: */ /* (*node1).changes += nonnegck((pchange+pdontcare)*gamma*length/ (pchange*(-expm1(-gamma*length))+ pnochange*exp(-gamma*length)+ pdontcare),770); changenorms[i] += gamma; */ /* following hopefully converges faster */ /* if(pchange>pnochange) rdontcaremin = pdontcare/(pchange+pdontcare); else rdontcaremin = pdontcare/(pnochange+pdontcare); (*node1).changes += (pchange+pdontcare)*gamma*length/ (pchange*(-expm1(-gamma*length))+ pnochange*exp(-gamma*length)+ pdontcare)-rdontcaremin*gamma*length ; changenorms[i] += gamma*(1.0-rdontcaremin) ; */ /* even faster? */ (*node1).changes += nonnegck(pchange*gamma*length/ (pchange*(-expm1(-gamma*length))+ pnochange*exp(-gamma*length)+ pdontcare),770); changenorms[i] += nonnegck(gamma* (pchange*(-expm1(-gamma*length))+ pnochange*exp(-gamma*length))/ (pchange*(-expm1(-gamma*length))+ pnochange*exp(-gamma*length)+ pdontcare),770); } } } } void poscountsupdate() { int pos1, i,k; char char1; double norm, p,dp; double p1,dp1,p2,dp2,poslike1,poslike2; double virtcts1, virtcts2; double virtctsp1, virtctsp2; Double_with_error *tporig, newcounts[26]; double gamma; pos1 = Pos; gamma = rateequil[pos1]; cleartreedata(); norm = 20.0*Pscount; /*pseudocount*/ for(char1='A';char1<='Z';char1++) { newcounts[char1-'A'].value = Pscount; newcounts[char1-'A'].error = 0.0; } for(i=0;i poslike2) { Posloglike = poslike1 + log(logck(1.0+exp(poslike2-poslike1),700)); } else { Posloglike = poslike2 + log(logck(1.0+exp(poslike1-poslike2),701)); } Virtcts = (exp(poslike1)*virtcts1 + exp(poslike2)*virtcts2)/ (exp(poslike1) + exp(poslike2)); Virtctsp = (exp(poslike1)*virtctsp1 + exp(poslike2)*virtctsp2)/ (exp(poslike1) + exp(poslike2)); } if(Char == 'Z') { Char = 'Q'; tporig = porig(treeloc[Treeit][k],'m'); p1 = (*tporig).value; dp1 = (*tporig).error; poslike1 = Posloglike; virtcts1 = Virtcts; Char = 'E'; tporig = porig(treeloc[Treeit][k],'m'); p2 = (*tporig).value; dp2 = (*tporig).error; poslike2 = Posloglike; virtcts2 = Virtcts; p = (p1*poslike1+p2*poslike2)/(poslike1+poslike2); dp = (dp1*poslike1+dp2*poslike2)/(poslike1+poslike2); newcounts['Q'-'A'].value += p1*poslike1/(poslike1+poslike2); newcounts['Q'-'A'].error +=dp1*poslike1/(poslike1+poslike2); newcounts['E'-'A'].value += p2*poslike2/(poslike1+poslike2); newcounts['E'-'A'].error +=dp2*poslike2/(poslike1+poslike2); norm += p; if(poslike1 > poslike2) { Posloglike = poslike1 + log(logck(1.0+exp(poslike2-poslike1),710)); } else { Posloglike = poslike2 + log(logck(1.0+exp(poslike1-poslike2),711)); } Virtcts = (exp(poslike1)*virtcts1 + exp(poslike2)*virtcts2)/ (exp(poslike1) + exp(poslike2)); Virtctsp = (exp(poslike1)*virtctsp1 + exp(poslike2)*virtctsp2)/ (exp(poslike1) + exp(poslike2)); } } Virtctsp *= gamma; /* change of variables */ for(char1='A';char1<='Z';char1++) { treecounts[Treeit][pos1][char1-'A'].value = newcounts[char1-'A'].value; treecounts[Treeit][pos1][char1-'A'].error = newcounts[char1-'A'].error; } virttreecounts[Treeit][pos1] = Virtcts; virttreecountsp[Treeit][pos1] = Virtctsp; } void treechangesnormalize() { int i, maxi; if(NoSeq >=3) maxi = 2*NoSeq-2; else maxi = 4; for(i=0;i 0.0) (Top[Treeit][i]).changes /= changenorms[i]; } } void updatelengths(treeit) int treeit; { int i, maxi; Treeit = treeit; if(NoSeq >=3) maxi = 2*NoSeq-2; else maxi = 4; for(i=0;i=0) { k=0; for(j=0;j 55) { putc('\n', treefile); Col = 0; } treeout((*p).right,donetopl,raw); if (attop==1) { putc(',', treefile); treeout((*p).mother,donetopl,raw); } putc(')', treefile); Col++; } x = ((*p).length)*Rateequil*(1.0-Selfchange); if(raw==1) x = (*p).length; if (x > 0.0) w = (long)(0.43429448222 * log(x)); else if (x == 0.0) w = 0; else w = (long)(0.43429448222 * log(-x)) + 1; if (w < 0) w = 0; if (attop == 1) fprintf(treefile, ";\n"); else { fprintf(treefile, ":%*.5f", (int)(w + 7), x); Col += w + 8; } } /* treeout */ void updatelocaltopology(treeit) int treeit; { int i, maxi; Treeptr nodeo, nodex, tmpptr; Treeit = treeit; if(NoSeq >=3) maxi = 2*NoSeq-2; else maxi = 4; for(i=0;iseqno < 0 && ( ((Top[Treeit][i]).left)->length > 1.0/LengthSeq || ((Top[Treeit][i]).right)->length > 1.0/LengthSeq) && (Top[Treeit][i]).length <= (1.0-exp(- (((Top[Treeit][i]).left)->length)))* (1.0-exp(- (((Top[Treeit][i]).right)->length)))) { nodeo = &(Top[Treeit][i]); nodex = (Top[Treeit][i]).mother; printf("changing topology tree %d nodes %d and %d",Treeit,nodeo->seqno,nodex->seqno); treeout(Top[Treeit],0,0); if(nodeo->side != 'm') { printf("(a)\n"); (nodeo->right)->mother = nodex; if(nodeo->side == 'l') nodex->left = nodeo->right; else if(nodeo->side == 'r') nodex->right = nodeo->right; else printf("unknown side %c!\n",nodeo->side); (nodeo->right)->side = nodeo->side; nodeo->mother = nodex->mother; if(nodex->side == 'l') (nodex->mother)->left = nodeo; else if(nodex->side == 'r') (nodex->mother)->right = nodeo; else if(nodex->side == 'm') (nodex->mother)->mother = nodeo; else printf("unknown side %c!\n",nodex->side); nodeo->side = nodex->side; nodex->mother = nodeo; nodeo->right = nodex; nodex->side = 'r'; nodeo->length = nodex->length; if(nodeo->length >= 1.0/LengthSeq) nodeo->length-=.01/LengthSeq; nodex->length = .03/LengthSeq; if((nodeo->left)->length >= 1.0/LengthSeq) (nodeo->left)->length -= .01/LengthSeq; if(&(Top[Treeit][0]) == nodex) /* have to move 'root' */ { nodeo->right = nodeo->mother; nodeo->mother = nodex; (nodeo->right)->side = 'r'; nodex->side = 'm'; } } else { printf("(b)\n"); (nodeo->right)->mother = nodex; tmpptr = nodex->right; nodex->right = nodeo->right; (nodex->left)->mother = nodeo; nodeo->right = nodex->left; (nodeo->right)->side = 'r'; tmpptr->mother = nodex; nodex->left = tmpptr; (nodex->left)->side = 'l'; nodeo->length = .03/LengthSeq; nodex->length = .03/LengthSeq; } } } } double calccountschange() { int pos1,char1; double maxchange; maxchange = 0.0; for(pos1=0;pos1 < LengthSeq;pos1++) { for(char1=0;char1<26;char1++) { if(fabs(counts[pos1][char1].value - oldcounts[pos1][char1]) > fabs(maxchange)) { maxchange = counts[pos1][char1].value - oldcounts[pos1][char1]; } oldcounts[pos1][char1]=counts[pos1][char1].value; } } return(maxchange); } void average() { /* average stored counts over all trees */ int i,pos1,char1; double tmp, maxlike,sumlike,tweight; maxlike = loglikevect[0]; for(i=0; i < NoTrees; i++) if(loglikevect[i]> maxlike) maxlike=loglikevect[i]; sumlike = 0.0; for(i=0; i < NoTrees; i++) sumlike += exp(loglikevect[i]-maxlike); for(pos1=0;pos10.0) { for(pos1=0;pos1 maxcts) maxcts = (newcounts[char1-'A']).value; } if(maxcts > 0.0){ ntied = 0; for(char1='A';char1<='Z';char1++) { if((newcounts[char1-'A']).value == maxcts) ntied ++; } if(ntied == 0) { printf("ntied == 0!\n"); early_exit(3083); } Virtcts = virtcounts[pos1]; Virtctsp = virtcountsp[pos1]; gamma = rateequil[pos1]; lambda = norm + Virtcts + Virtctsp*(1.0/((double) ntied)-1.0/gamma); while(square(lambda)- 4.0*(Virtctsp/((double) ntied))* (norm-maxcts*((double) ntied)) < 0.0) { Virtctsp /= 2.0; /* total hack!*/ lambda = norm + Virtcts + Virtctsp*(1.0/((double) ntied)-1.0/gamma); } lambda = (lambda + sqrt(nonnegck(square(lambda)- 4.0*(Virtctsp/((double) ntied))* (norm-maxcts*((double) ntied)) ,911)))/2.0; /* else { printf("\n bad input to sqrt, %g, ntied= %d, gamma = %g, Virtctsp=%g", square(lambda)- 4.0*(Virtctsp/((double) ntied))* (norm-maxcts*((double) ntied)), ntied, gamma, Virtctsp); printf("Pos = %d, Char= %c", Pos, Char); printf("norm = %g, maxcts= %g, Virtcts= %g\n", norm, maxcts, Virtcts); } */ effectivevirtcts = ((lambda/(lambda-Virtctsp/((double) ntied)))* (maxcts+(Virtcts-Virtctsp/gamma)/((double) ntied)) - maxcts)* ((double) ntied); ctssofar = 0.0; if(effectivevirtcts >= 0.0) { for(char1='A';char1<='Z';char1++) { if(newcounts[char1-'A'].value == maxcts) { newcounts[char1-'A'].error += (2.0*newcounts[char1-'A'].value+effectivevirtcts/(double)ntied)* effectivevirtcts/(double) ntied; newcounts[char1-'A'].value += effectivevirtcts/(double) ntied; } } } else { /*find second largest counts:*/ smaxcts = 0.0; for(char1='A';char1<='Z';char1++) { if(newcounts[char1-'A'].value > smaxcts && newcounts[char1-'A'].value < maxcts ) smaxcts = newcounts[char1-'A'].value; } ctstoadd = Virtcts; while(effectivevirtcts/((double) ntied) + maxcts < smaxcts && smaxcts > 0.0) { ntied = 0; for(char1='A';char1<='Z';char1++) { if(newcounts[char1-'A'].value >= smaxcts) { ctstoadd -= (smaxcts-newcounts[char1-'A'].value); norm += (smaxcts-newcounts[char1-'A'].value); newcounts[char1-'A'].error += (2.0*newcounts[char1-'A'].value+(smaxcts-newcounts[char1-'A'].value))* (smaxcts-newcounts[char1-'A'].value); newcounts[char1-'A'].value = smaxcts; ntied ++; } } if(ntied ==0) { printf("ntied == 0!\n"); early_exit(3117); } ctssofar = Virtcts-ctstoadd; maxcts = smaxcts; lambda = norm + ctstoadd + Virtctsp*(1.0/((double) ntied)-1.0/gamma); lambda = (lambda + sqrt(nonnegck(square(lambda)- 4.0*(Virtctsp/((double) ntied))* (norm-maxcts*((double) ntied)) ,913)))/2.0; effectivevirtcts = ((lambda/(lambda-Virtctsp/((double) ntied)))* (maxcts+(ctstoadd-Virtctsp/gamma)/((double) ntied)) - maxcts)* ((double) ntied); smaxcts = 0.0; for(char1='A';char1<='Z';char1++) { if(newcounts[char1-'A'].value > smaxcts && newcounts[char1-'A'].value < maxcts ) smaxcts = newcounts[char1-'A'].value; } } /* end while*/ if(effectivevirtcts <= 0.0) { for(char1='A';char1<='Z';char1++) { if(newcounts[char1-'A'].value == maxcts) { if(smaxcts > 0.0) { newcounts[char1-'A'].error += (2.0*newcounts[char1-'A'].value+effectivevirtcts/(double)ntied)* effectivevirtcts/(double) ntied; newcounts[char1-'A'].value += effectivevirtcts/(double) ntied; } else { virtcounts[pos1] -= effectivevirtcts/(double)ntied; } } } } else { printf("\n bistable virtcts: Virtcts= %g, Virtctsp = %g, eff=%g\n", Virtcts,Virtctsp,effectivevirtcts); printf("lambda = %g, ntied = %d \n",lambda, ntied); early_exit(3168); } } /*end else*/ } /*end if */ } /*end for*/ } void savecounts() { int pos1, char1; double tmp; for(pos1=0;pos1 0.0 && p < 1.0) entro -= p*log(logck(p,222))/log(2.0); } entropy[pos1] = entro; } } void printprobs() { FILE *fp; int pos1, i; fp = Fopen("results","w"); for(i=0;i<26;i++) fprintf(fp,"%-24c ",'A'+i); fprintf(fp,"\n"); for(pos1=0;pos1 ltol && fabs(countschange) > COUNTTOL && sumloglike - loglikeinit < improve && numit < MAXUPIT) || numit < 2) || lasttime ==0) { if(!((likechange > ltol && fabs(countschange) > COUNTTOL && sumloglike - loglikeinit < improve && numit < MAXUPIT) || numit < 2)) lasttime = 1; numit ++; for(Treeit = 0; Treeit < NoTrees; Treeit++) { loglikelihood = 0.0; cleartreechanges(); for(pos1=0;pos1 1) { if(Posloglike+.000001 < oldposloglike[pos1]) { printf("likelihood decrease:Pos= %d, old= %g, new= %g\n", Pos,oldposloglike[pos1],Posloglike); } } if(Treeit == 1) oldposloglike[pos1] = Posloglike; */ poschangesupdate(); } loglikevect[Treeit] = loglikelihood; treechangesnormalize(); } /*done all trees */ for(Treeit = 0; Treeit < NoTrees; Treeit++) { if(UserTree < 3) updatelengths(Treeit); if(UserTree < 2) updatelocaltopology(Treeit); } average(); /* average olprob over trees */ if(lasttime ==1 && Convgd ==1) { savecounts(); /* adjust errors */ printcounts(); printgaps(); } applyvirtcts(); /* take virt cts into account*/ for(pos1=0;pos1 maxloglike) maxloglike = loglikevect[Treeit]; sumloglike = 0.0; for(Treeit = 0; Treeit < NoTrees; Treeit++) sumloglike += exp((loglikevect[Treeit]-maxloglike)); sumloglike = log(sumloglike) + maxloglike; countschange = calccountschange(); printf("sumloglike= %g dcounts = %g\n loglikes=", sumloglike,countschange); for(i=0;i= 2) { if( sumloglike + LTOL < oldloglike && Pscount == 0.0) { printf("danger: likelihood decrease in EM!\n"); printf("sumloglike %g, oldloglike %g\n",sumloglike,oldloglike); printf("numit = %d\n", numit); printf("\n loglikes="); for(i=0;i0.0) { printf("%c: %f \n",char1,(*ndata).pcharvect[char1-'A']); for(i=0;i< (*ndata).maxn[char1-'A'];i++) printf("%d: %g ",i+1,(*ndata).pnvect[char1-'A'][i]); printf("\n"); } printf("\n"); } void printlike() { FILE *fp; int i; double maxlike; maxlike = loglikevect[0]; for(i=0; i < NoTrees; i++) if(loglikevect[i]> maxlike) maxlike=loglikevect[i]; fp = Fopen("likelihood","w"); fprintf(fp,"loglikelihood= %g\n",maxlike); fclose(fp); } int comparetrees(tree1, tree2) Treeptr tree1, tree2; /*will find identical trees if numbered the same */ { int j; int numnodes; numnodes = 2*NoSeq - 2; j=0; while(j < numnodes && tree1[j].seqno == tree2[j].seqno && (*tree1[j].mother).seqno == (*tree2[j].mother).seqno) j++; if(j== numnodes) return(1); /* they match */ return(0); } int checktreenovelty() { int ttree; for(ttree=(Treeit+1)%NoTrees;ttree != Treeit; ttree=(ttree+1)%NoTrees) { if(comparetrees(Top[ttree],Top[Treeit]) == 1) { printf("\n exact match:"); printf(" (same tree as %d)\n",ttree); return(ttree); } } return(-1); } int checktreeconvergence() { int pos1, char1, poschange, charchange; double maxchange, minmaxchange; int ttree, tmp; /* see if tree we just made matches any previous tree */ maxchange = 1.0 + FINALTOL; minmaxchange = 1000.0; tmp = checktreenovelty(); if(tmp >= 0) return(tmp); for(ttree=(Treeit+1)%NoTrees;ttree != Treeit; ttree=(ttree+1)%NoTrees) { maxchange = 0.0; pos1=0; while(pos1 < LengthSeq && fabs(maxchange) < FINALTOL) { for(char1=0;char1<26;char1++) { if(fabs(treecounts[Treeit][pos1][char1].value - treecounts[ttree][pos1][char1].value) > fabs(maxchange)) { maxchange = treecounts[Treeit][pos1][char1].value - treecounts[ttree][pos1][char1].value; poschange = pos1; charchange = char1; } } if(fabs(virttreecounts[Treeit][pos1] - virttreecounts[ttree][pos1]) > fabs(maxchange)) { maxchange = virttreecounts[Treeit][pos1] - virttreecounts[ttree][pos1]; poschange = pos1; charchange = 1; } pos1 ++; } if(fabs(maxchange) < minmaxchange) minmaxchange = fabs(maxchange); } if(fabs(maxchange) < FINALTOL) /*converged!*/ { printf("\n close to tree %d \n",ttree); return(ttree); } else { printf("minmaxchange=%g \n",minmaxchange); return(-1); } } int checkconvergence() { int pos1, char1; double maxchange, minmaxchange; /* see if counts matches previous iteration */ maxchange = 1.0 + FINALTOL; minmaxchange = 0.0; pos1=0; maxchange = 0.0; while(pos1 < LengthSeq && fabs(maxchange) < FINALTOL) { for(char1=0;char1<26;char1++) { if(fabs(counts[pos1][char1].value - prevcounts[pos1][char1]) > fabs(maxchange)) { maxchange = counts[pos1][char1].value - prevcounts[pos1][char1]; } } pos1 ++; } if(fabs(maxchange) < FINALTOL) /*converged!*/ { printf("\n converged! \n"); return(1); } else { printf("\n maxchange = %g",maxchange); return(-1); } } void improvenewtree(ltol) double ltol; { int pos1, i; double likechange; double loglikelihood, bestloglike; int numit; double oldloglike, extraploglike; /*double oldposloglike[MaxLengthSeq];*/ numit = 0; likechange = ltol + 1.0; bestloglike = loglikevect[0]; for(i=0;i bestloglike) bestloglike = loglikevect[i]; loglikelihood = 2.0*bestloglike - 1.0; extraploglike = bestloglike; while(likechange > ltol && numit < MAXUPIT && loglikelihood < bestloglike && (extraploglike > bestloglike - LIKECUTOFF || numit < MINTREEIMPIT)) { numit ++; loglikelihood = 0.0; cleartreechanges(); for(pos1=0;pos1= 2) { if( loglikelihood + LTOL < oldloglike && Pscount == 0.0) { printf("warning: likelihood decrease in EM!\n"); /* printf("loglike %g, oldloglike %g\n",loglikelihood,oldloglike); printf("numit = %d\n", numit); printf("\n loglikes="); for(i=0;i= bestloglike) printf(" new tree %d optimized in %d its\n", Treeit, numit); else printf(" gave up improving new tree %d after %d its\n", Treeit, numit); } void newtree(tol) double tol; { int i, worsti; double minlike; Treeptr tmpptr; char command[70], numberstring[15]; tmpptr=allocatetree(); Treeit = NoTrees; if(tmpptr==NULL) { if(Iter==0) { printf("not enough memory for tree!\n"); printf("exiting...\n"); early_exit(2808); } else if(Iter<2) { printf("not enough memory for two trees!\n"); printf("try reducing MaxNoSeq or MaxLengthSeq\n"); printf("exiting...\n"); early_exit(2809); } else { printf("out of memory, replacing worst tree\n"); } } if(tmpptr==NULL||Treeit==TREEMAX) { minlike = loglikevect[0]; worsti = 0; for(i=0; i < NoTrees; i++) { if(loglikevect[i]< minlike) { minlike=loglikevect[i]; worsti = i; } } Treeit = worsti; } else { Top[NoTrees] = tmpptr; NoTrees ++; } if(UserTree==0) writedistances(); if(NoTrees==1 && UserTree==0) { if(TreeScript == 0) { printf("\nWill use weighbor to build trees\n"); system("which weighbor"); } else if(NoSeq > MaxSlowSeqs ) { printf("\nMore than MaxSlowSeqs seqs; will use treescript=:\n"); system("cat treescript"); } else if(zerodist == 1 ) { printf("\nBecause of identical seqs, will use treescript=:\n"); system("cat treescript"); } else if( NoSeq < 4 ) { printf("\nBecause less than 4 seqs, will use treescript=:\n"); system("cat treescript"); } else { printf("\nFewer than MaxSlowSeqs seqs; will use slowtreescript=:\n"); system("cat slowtreescript"); } } if(TreeScript == 0 && UserTree==0) { strncpy(command,"weighbor -b ",70); sprintf(numberstring,"%f ",1/Selfchange); strncat(command,numberstring,15); strncat(command,"-L ",15); sprintf(numberstring,"%f ",Eff_Length); strncat(command,numberstring,15); strcat(command,"< infile > treefile "); printf("command = %s\n", command); if(system(command) != 0) { printf("could not execute weighbor\n"); early_exit(3148); } } else if(UserTree==0 && (NoSeq > 16 || NoSeq < 4 || zerodist == 1)) { if(system("treescript")!= 0) { if(NoTrees==1) printf("could not execute treescript; trying neighbor\n"); if(system("echo y | nnneighbor > treescript.out")!=0) { if(NoTrees==1) printf("nnneighbor didn't work; trying neighbor\n"); if(system("echo y | neighbor > treescript.out")!=0) { if(NoTrees==1) printf("Hopefully you already have a valid tree installed in treefile.\n"); } } } } else if(UserTree==0) { if(system("slowtreescript")!= 0) { if(NoTrees==1) printf("could not execute slowtreescript; trying treescript\n"); if(system("treescript")!= 0) { if(NoTrees==1) printf("could not execute slowtreescript; trying fitch\n"); if(system("echo y | fitch > slowtreescript.out")!=0) { if(NoTrees==1) printf("trying nnneighbor\n"); if(system("echo y | nnneighbor > slowtree")!=0) { if(NoTrees==1) printf("Hopefully you already have a valid tree installed in treefile.\n"); } } } } } if((treefile=Fopen("treefile","r"))==NULL) { printf("can't open treefile for reading!\n"); early_exit(3013); } for(i=0;i 1) { improvenewtree(tol); } } void printbesttree() { int i, besti; double maxlike; treefile = Fopen("besttree","w"); Col = 0; maxlike = loglikevect[0]; besti = 0; for(i=0; i < NoTrees; i++) { if(loglikevect[i]> maxlike) { maxlike=loglikevect[i]; besti = i; } } treeout(Top[besti],0,0); fclose(treefile); treefile = Fopen("besttree.raw","w"); treeout(Top[besti],0,1); fclose(treefile); } void printbesttrees() { int i, besti, sbesti; double maxlike; treefile = Fopen("besttree","w"); Col = 0; maxlike = loglikevect[0]; besti = 0; for(i=0; i < NoTrees; i++) { if(loglikevect[i]> maxlike) { maxlike=loglikevect[i]; besti = i; } } treeout(Top[besti],0,0); fclose(treefile); treefile = Fopen("besttree.raw","w"); treeout(Top[besti],0,1); fclose(treefile); /* now 2nd best: */ treefile = Fopen("2ndbesttree","w"); Col = 0; if(besti == 0) { maxlike = loglikevect[1]; sbesti = 1; } else { maxlike = loglikevect[0]; sbesti = 0; } for(i=0; i < NoTrees; i++) { if(loglikevect[i]> maxlike && i != besti) { maxlike=loglikevect[i]; sbesti = i; } } treeout(Top[sbesti],0,0); fclose(treefile); } void cpnode(fromnode,tonode) Treeptr fromnode, tonode; { (*tonode).seqno = (*fromnode).seqno; (*tonode).length = (*fromnode).length ; if((*tonode).seqno <0) { (*tonode).left = tonode + ((*fromnode).left-fromnode); (*tonode).right = tonode + ((*fromnode).right-fromnode); } else { (*tonode).left = NULL; (*tonode).right = NULL; } (*tonode).mother = tonode + ((*fromnode).mother-fromnode); (*tonode).side = (*fromnode).side ; } void cptree(fromtree,totree) Treeptr fromtree, totree; { int j; int numnodes; numnodes = 2*NoSeq - 2; for(j=0;j bestlike) bestlike = loglikevect[i]; if(loglikevect[i] < worstlike) { worstlike = loglikevect[i]; worsti = i; } } if(NoTrees > TREEMIN && worstlike - bestlike < -LIKECUTOFF) { if(worsti != NoTrees - 1) /* discard worst tree */ { loglikevect[worsti] = loglikevect[NoTrees-1]; cptree(Top[NoTrees - 1], Top[worsti]); for(i=0;i