/*
 * Declarations common to all dynamic programming molecular biology modules.
 *
 * This header defines the shared size limits, file/database type enums,
 * the sequence, profile, scoring-constant, result and p-value structures,
 * and the INLINE portability macro.
 */
#ifndef SEQALN_H
#define SEQALN_H

/* Size limits; each may be overridden by defining it before inclusion. */
#ifndef MAXLINE
#define MAXLINE 512
#endif

#ifndef MAXPATH
#define MAXPATH 2000
#endif

#ifndef MAXSEQNAME
#define MAXSEQNAME 1024
#endif

/*
 * FILE is used by the structures below, so <stdio.h> is always required.
 * The standard header is safe to include more than once.
 * (These are pure C programs, not C++.)
 */
#include <stdio.h>

/*
 * Library version string.  It is a static object, so every translation
 * unit that includes this header gets its own copy.  It points at a
 * string literal and must not be written through.
 */
static char *SEQALN_VERSION = "1.20";

/* The type of file, for a file used for input */
typedef enum { SEQ, MAT, PRO, DIS, DB, PIPE, SOCKET } SEQALN_FTYPE;

/* The type of database (file format), for a sequence file used for input */
typedef enum { UNKNOWN, GENBANK, NEWAT, FASTA, PIR, SWISSPROT }
    SEQALN_DBTYPE;

/* The io struct holds input/output information */
struct SEQALN_IO {
    FILE *STDIN;            /* Place to obtain terminal input */
    FILE *STDOUT;           /* Place to put stdout output. */
    FILE *STDERR;           /* Place to put stderr output. */
};

/* Type SEQALN_TRACE is used for tracebacks; a singly linked list node */
typedef struct SEQALN_LSTRUCT {
    struct SEQALN_LSTRUCT *next;
    int i, j;
} SEQALN_TRACE;

/* Holds (i,j) pairs for multiple locations of one score */
typedef struct {
    int score, i, j;
} SEQALN_SIJ;

/*
   Sequences begin in position 1 in seq[] and code[];
   position 0 corresponds to row or column zero in the scoring matrix,
   which holds initial gap scores.
   To print a sequence, use for example `printf("%s\n",&seq->seq[1]);'
*/
struct SEQALN_SEQUENCE {
    int len;                /* Sequence length */
    char name[MAXSEQNAME];  /* Sequence name */
    int reverse;            /* The sequence has been reversed */
    int complement;         /* The sequence has been complemented */
    int revcomp;            /* This is the reverse complement of the real seq */
    int xlated;             /* Whether nucleotide->protein translation was done */
    char *seq;              /* To hold letters of the sequence len+2 size */
    char *code;             /* To hold letter numbers [0..26] of the sequence */
    int bstart, bend;       /* Start, stop locations of band sequence (for band) */
    int fstart, fend;       /* Start, stop locations of ORF */
    double *freq;           /* Frequency distribution of letters (for p-value) */
    char *fname;            /* File name, where we obtain the sequence(s) */
    FILE *fptr;             /* Pointer to current location in database */
    long fposn;             /* Offset from start of file for next sequence */
    SEQALN_FTYPE ftype;     /* Type of file from which input is obtained */
    SEQALN_DBTYPE dbtype;   /* Type of database, if one is used for input */
    int maxlen;             /* Length allocated for this buffer (max seq len) */
    int resized;            /* Denotes whether maxlen changed for last seq read */
};

/* Type prorow is used for profiles: one row of 26 integer columns */
typedef int SEQALN_PROROW[26];

/*
   A profile is an array of 26 columns and `len' rows.
*/
struct SEQALN_PROFILE {
    int len;
    char name[MAXSEQNAME];  /* Profile name */
    SEQALN_PROROW *profile;
    int alpha, beta;
    char *fname;            /* Name of file containing profile */
    FILE *fptr;             /* Offset into profile file [not used] */
};

enum PTYPE { GLOBAL, FIT, OVER, LOCAL };    /* Program type */
enum STYPE { MATRIX, NONMATRIX, PROFILE };  /* Score function type */
enum MTYPE { DISTANCE, SIMILARITY };
/* Full / self-repeat / tandem-repeat / band matrix */
enum BTYPE { FULL, SELF, TANDEM, BAND };
enum MEMUSE { LINEAR, SQUARE };             /* How much memory we use */
enum STATS { NONE, PVALUE };                /* Type of statistics to compute */

/*
   These are all the scoring constants used by the software;
   once established for comparing two sequences, they are expected to
   remain unchanged.
*/
struct SEQALN_CONSTANTS {
    int alpha, beta;        /* Gap score parameters, positive */
    int csub;               /* Limit for printing conservative substitutions */
    int revcomp;            /* 0: seq1 is normal; 1: use seq1's reverse compl. */
    int cutoff;             /* Minimum score when reporting multiple scores */
    int naligns;            /* No. of multiple alignments requested */
    int rptalign;           /* Whether to report alignment */
    int rptmat;             /* Whether to report scoring matrix dump */
    int rptscore;           /* Whether to report score */
    int rptptable;          /* Table of score/obs. prob/est. prob for p-value */
    int rpttrace;           /* Whether to report traceback list */
    int rptboth;            /* Whether to perform both upper & lower tracebacks */
    int verbose;            /* Whether to print in verbose mode */
    int width;              /* No. of chars in a line of seq. alignment */
    /*
     * Used for delta(a,b) scoring of 2 chars.  Deliberately declared
     * without a parameter list: a prototype here was a problem for the
     * C++ compiler with pdelta(int, char, void*).
     */
    int (*deltafn)();
    int traceupper;         /* Preference to take in tracebacks */
                            /* 0 ==> lower envelope; non-0 ==> upper envelope */
    int envelope;           /* =1 if tracing upper & lower alignment envelopes */
    int flip;               /* To flip (toggle) traceupper */
    int showmmatch;         /* Print '*' (mismatch) in align., not '|' (match) */
    /* Profile specific */
    int palpha, pbeta;      /* Gap scores for indels in a profile */
    /* Overlap specific */
    int overout;            /* Types of overlap alignments to output: */
                            /* 0:highest score; 1:last row; 2:last col. 3:both */
    /* P-value specific */
    int nsims;              /* # of simulations to perform */
    int ndeclumps;          /* # of declumps for each simulation */
    int allscores;          /* 1: use all simulation scores; 0: use middle 80% */
    double gamma;           /* Value of gamma */
    double p;               /* Value of p */
    char *randfile;         /* File with GFSR random number generator state */
    /* Non-matrix specific */
    int match, mismatch;    /* Match/mismatch non-matrix score parameters */
    /* Matrix specific */
    int xmatrix[32][32];    /* Scoring matrix; use 26x26 */
    int offset;             /* Offset for elements of matrix */
    /* Tandem repeat specific */
    int window;             /* Pattern stride for wraparound local similarity */
    /* Band specific */
    int center, Delta1, Delta2; /* center & Delta on each side for 2nd seq. */
    /* Miscellaneous parameters */
    int Debug;              /* Debug flag, for debugging output */
    enum PTYPE ptype;
    enum STYPE stype;
    enum MTYPE mtype;
    enum BTYPE btype;
    enum MEMUSE memuse;
    enum STATS stats;
};

/*
   This structure holds intermediate and final results of scoring.
*/
struct SEQALN_RESULTS {
    int **S;                /* Scoring matrix size=colxrow */
    int S_size;             /* Maximum allocated size of S 2d-array (an older
                               note read: max(seq1len)*max(seq2len) so far) */
    int *E;                 /* Indels from one direction, only for declumping */
    int *F;                 /* Indels from other direction */
    int *best;              /* Best score limit for any row or column */
    int bestS;              /* Maximum score */
    int bestSi, bestSj;     /* Coordinates in S of maxS */
    int naligns;            /* No. of multiple alignments left, when requested */
    int jstart;             /* Starting location in scoring matrix, for band */
    int jmax;               /* Ending location in scoring matrix, for band */
    int maxentry;           /* High score for each match */
    SEQALN_TRACE *trace;    /* Current traceback list */
    char **aligned;         /* For reporting multiple alignments -- need 1 bit */
    char *linbits;          /* For reporting multiple alignments with wrapping */
    SEQALN_SIJ *listsij;    /* For holding (i,j) pairs for multiple alignments */
    int listcount;          /* Count of multiple alignments */
    struct SEQALN_PVALUE *pvalue; /* Holding p-value related information */
    /* Information for alignment outputs. */
    int start1, start2;     /* Starting positions of alignment in seq1, seq2 */
    char *align0;           /* Alignment string for match/mismatch of seqs */
    char *align1;           /* Alignment string for 1st sequence */
    char *align2;           /* Alignment string for 2nd sequence */
    int align_len;          /* Length of each of the above 3 alignment strings */
    int nogap_len;
    int max_align_len;
    int collen;             /* Current length of column */
    int rowlen;             /* Intended to save the allocation deallocation */
    /* The next four are for looking at the distribution of the match */
    int match_freq1[26];    /* Residue frequency in the matched region */
    int match_freq2[26];    /* [Residue_char - 'A'] is the index number */
    int iden_count;         /* Identical residues */
    int siml_count;         /* Similar residue counts */
    int end1, end2;         /* End position of align in each sequence */
};

/*
   This structure holds intermediate and final results unique to p-value
   software, so the results data structure doesn't have to hold them if
   p-value software is not used.
*/
struct SEQALN_PVALUE {
    double p;               /* value for p in lambda = gamma * m * n * p^t */
    double gamma;           /* value for gamma in lambda = gamma * m * n * p^t */
    double pvalue;          /* value for p-value */
    double lambda;          /* lambda = gamma * seq1len * seq2len * p^score */
    double coeff;           /* if -0.99 < coeff < 0.99, unreliable parameters */
    int nsims;              /* number of simulations requested */
    int ndeclumps;          /* number of declumps for each simulation */
    int sims2go;            /* simulations left to perform */
    int declumps2go;        /* declumps left to perform for this simulation */
    int seq1len;            /* length to use for `m', the size of sequence 1 */
    double seq1dist[26];    /* letter frequency distribution of sequence 1 */
    int seq2len;            /* length to use for `n', the size of sequence 2 */
    double seq2dist[26];    /* letter frequency distribution of sequence 2 */
    int *hist;              /* histogram of scores, for linear regression */
    int nhist;              /* number of elements in histogram */
    int npvals;             /* number of p-values computed w/ declumping so far */
};

/*
 * INLINE selection, step 1: the compiler's default.
 * NOTE(review): __STDC__==1 is also true for strict C89 compilers, which
 * have no `inline' keyword; supply HAS_INLINE=0 for such builds.
 */
#ifdef __STDC__
#if __STDC__==1
#define INLINE inline
#else
#define INLINE
#endif
#endif

/*
 * Step 2: the Makefile-supplied definition overrides the compiler's
 * default.  The #undef is required: redefining INLINE with a different
 * replacement list without it is diagnosed by the compiler.
 */
#ifdef HAS_INLINE
#undef INLINE
#if HAS_INLINE==1
#define INLINE inline
#else
#define INLINE
#endif
#endif

/* Step 3: for non-ANSI cc compiler, don't expect `inline' support */
#ifndef INLINE
#define INLINE
#endif

#endif /* SEQALN_H */
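/* 1.5.6 -- trailing stamp from the source listing (presumably the listing generator's version); not part of the header */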