#include "bioseq.h"
#include <cctype>
#include "strformat.h"
#include <fstream>
#include <math.h>

Functions | |
| void | translate (string &pep, const string &seq, int begin, int end) |
| int | countInternalStops (const string &seq) |
| int | aachar2num (char a) |
| char | aanum2char (int c) |
| Range | P2Rindex (const Range &ofb, const int frame) |
| Interval | P2Rindex (const Interval &ofb, const int frame) |
| int | P2Rindex (const int pos, const int frame) |
| bool | overlayRVR (const Range &r, const vector< Range > &vr) |
| bool | farRVR (const Range &r, const vector< Range > &vr, const int hhc, const int htc, const int ttc) |
| vector< Range > | findAllORFIndex (const string &rna, int base, int peplen_cutoff, int HHcut, int HTcut, int TTcut) |
| void | findAllPepORFIndex (list< Range > &orfrange, const string &ss, int cutoff) |
| string | reverseComplement (const string &seq) |
| char | complementChar (char ch) |
| void | reverseComplementInPlace (string &seq) |
| void | printFasta (ostream &ous, const string &seq, int width) |
| ostream & | operator<< (ostream &ous, const bioseq &s) |
| pair< int, int > | longestORFIndex (const Protein &p) |
| void | longestORFPlus (const string &rna, string &pep, int &b, int &e) |
| void | longestORFPlus (const string &rna, string &pep, int &b, int &e, int &f) |
| void | longestORFPlusSuffix (const string &rna, string &pep, int &b, int &e) |
| void | longestORFPlusPrefix (const string &rna, string &pep, int &b, int &e) |
| bool | longestNoStartORFPlus (const string &rna, pair< int, int > &nterm, string &npep, pair< int, int > &full, string &pep) |
| bool | longestNoStopORFPlus (const string &rna, pair< int, int > &cterm, string &cpep, pair< int, int > &full, string &pep) |
| bool | loadFastaIntoMap (const string &file, map< string, string > &store) |
| int aachar2num | ( | char | a | ) |
used by encode function
Referenced by Matrix::lookup(), and Matrix::read().
| char aanum2char | ( | int | c | ) |
convert amino acid number to character
| char complementChar | ( | char | ch | ) |
Referenced by reverseComplementInPlace().
| int countInternalStops | ( | const string & | seq | ) |
global function to be used in a more flexible way
Referenced by JGIModel::valid().
| bool farRVR | ( | const Range & | r, | |
| const vector< Range > & | vr, | |||
| const int | hhc, | |||
| const int | htc, | |||
| const int | ttc | |||
| ) |
Referenced by findAllORFIndex().
| vector<Range> findAllORFIndex | ( | const string & | rna, | |
| int | base, | |||
| int | peplen_cutoff, | |||
| int | HHcut, | |||
| int | HTcut, | |||
| int | TTcut | |||
| ) |
References farRVR(), findAllPepORFIndex(), P2Rindex(), reverseComplement(), and translate().
Referenced by ESTAssemblyid::breakup(), and testFindORF().
| void findAllPepORFIndex | ( | list< Range > & | orfrange, | |
| const string & | ss, | |||
| int | cutoff | |||
| ) |
Find all the ORF index bounds in 0-based index. Numbers are in protein space. Minimum AA length to register: 25 aa is the minimum we are going to register.
| orfrange | is the result of this operation. It will be emptied at the beginning of the run if it was not already empty. The output is in ss index. The end of the range is the '*' symbol for a complete ORF or prefix ORF. The begin of the range is the index of 'M'. | |
| ss | is the input peptide sequence. | |
| cutoff | is an integer number. Peptides shorter than this are ignored in the searching phase. This parameter should not affect the real performance of this algorithm. It provides fine control. Currently I am using 25 aa. |
References max.
Referenced by findAllORFIndex().
| bool loadFastaIntoMap | ( | const string & | file, | |
| map< string, string > & | store | |||
| ) |
load all the sequences in the file into a map for later usage. Only the id is used, title information is not cached.
References ifstream(), and bioseq::seq.
Referenced by main(), testFromFile(), and testLoad().
| bool longestNoStartORFPlus | ( | const string & | rna, | |
| pair< int, int > & | nterm, | |||
| string & | npep, | |||
| pair< int, int > & | full, | |||
| string & | pep | |||
| ) |
Find the longest ORF of 1----* and M---*, and return both of them in one operation. Let the user decide what to do with the two values. The * stands for the stop codon. Uses 0-based index, inclusive [b,e]: e is the third base of the stop codon; b is the first base of the start codon if the ORF is complete. For a no-start ORF, b is the frame, implying start from 0. b,e is packed into the pair data structure.
nterm, and full contain the [b,e] in RNA coordinates.
If the sequence does not contain NoStart or Full ORF, then return false;
References bioseq::length(), and DNA::translate().
Referenced by ESTAssembly::breakPrefixModel(), ESTAssembly::breakSuffixModel(), and testLongestNMissingORF().
| bool longestNoStopORFPlus | ( | const string & | rna, | |
| pair< int, int > & | cterm, | |||
| string & | cpep, | |||
| pair< int, int > & | full, | |||
| string & | pep | |||
| ) |
References bioseq::length(), and DNA::translate().
Referenced by ESTAssembly::breakPrefixModel(), ESTAssembly::breakSuffixModel(), testLongestNMissingORF(), and testLongestPlus().
| pair<int,int> longestORFIndex | ( | const Protein & | p | ) |
| void longestORFPlus | ( | const string & | rna, | |
| string & | pep, | |||
| int & | b, | |||
| int & | e, | |||
| int & | f | |||
| ) |
Find all ORFs (in the middle of pepseq M...*, or ...*, or M...) in all three reading frames of rna, and pick the longest one. Sets the pep seq, and b, e as 0-based inclusive index [b,e] in RNA index.
| f | is the frame [0,1,2]. Actually the frame can be derived from b % 3; the frame is always b % 3. The actual begin is 0 if b < 3 and the start is not M. |
References Interval::begin(), Interval::end(), find(), Interval::length(), bioseq::length(), Range::length(), P2Rindex(), bioseq::substr(), and DNA::translate().
| void longestORFPlus | ( | const string & | rna, | |
| string & | pep, | |||
| int & | b, | |||
| int & | e | |||
| ) |
References longestORFPlus().
Referenced by longestORFPlus(), mRNAModel::reset(), ESTAssembly::setCDSInfo(), mRNAModel::setLongestCDSAndProtein(), testFindORF(), and testLongestPlus().
| void longestORFPlusPrefix | ( | const string & | rna, | |
| string & | pep, | |||
| int & | b, | |||
| int & | e | |||
| ) |
Prefix ORF is the one with stop but no start
References bioseq::length(), and DNA::translate().
| void longestORFPlusSuffix | ( | const string & | rna, | |
| string & | pep, | |||
| int & | b, | |||
| int & | e | |||
| ) |
Suffix ORF is an ORF with start but no stop.
References bioseq::length(), and DNA::translate().
| ostream& operator<< | ( | ostream & | ous, | |
| const bioseq & | s | |||
| ) |
FASTA format, 70 residues per line. If the sequence has a name it will produce a full FASTA file, otherwise only the sequence.
References bioseq::name, bioseq::printFasta(), bioseq::seq, and bioseq::title.
helper function for findAllORFIndex()
References Range::overlay().
| int P2Rindex | ( | const int | pos, | |
| const int | frame | |||
| ) |
References Interval::begin(), and Interval::end().
This applies to a translated full ORF. For prefix and suffix ORFs you need special treatment. Prefix is OK: it will use the start to encode frame information; if start < 3 then it contains frame info.
References Range::begin(), and Range::end().
Referenced by findAllORFIndex(), and longestORFPlus().
| void printFasta | ( | ostream & | ous, | |
| const string & | seq, | |||
| int | width = 70 | |||
| ) |
Class-wide function that can be used without constructing a bioseq object. This will increase performance if you are only interested in the operation and not in using the bioseq object and its derived classes.
Referenced by extractInter(), main(), mRNAModel::show(), testLoad(), and ESTAssembly::write().
| string reverseComplement | ( | const string & | seq | ) |
This operation only makes sense on DNA or RNA sequences.
Referenced by ESTAssembly::breakPrefixModel(), ESTAssembly::breakSuffixModel(), GenModel::CDSSeq(), checkstop(), countNInIntrons(), findAllORFIndex(), RNAModel::intronBound(), main(), mRNAModel::reset(), mRNAModel::setLongestCDSAndProtein(), and stopsInIntrons().
| void reverseComplementInPlace | ( | string & | seq | ) |
a version for better performance
References complementChar().
Referenced by main(), DNA::revcomp(), RNAModel::seqGenomic(), ESTAssembly::setCDSInfo(), Noschain::subsequence(), and testLongestNMissingORF().
| void translate | ( | string & | pep, | |
| const string & | seq, | |||
| int | begin, | |||
| int | end = -1 | |||
| ) |
Algorithms about ORF. Prefix ORF: ORF starting from the 5' end without a start codon. Suffix ORF: ORF ending at the 3' end without a stop codon. Full ORF: ORF with start and stop codons. Protein index to RNA index transformation, in the 0-based index system: Ri = frame + 3*Pi gives the first base of the codon. For the end of the stop codon, you need to add 2. Helper function to be used by other methods. Uses more basic types; for convenience, uses 1-based index, inclusive.
| begin | start position of translation, first base. | |
| end | end position of translation, the third base of the codon if not at the end of the sequence. This can be any of the codon bases; it just generates partial peptides with the last amino acid unspecified. |
References ct, and DNA::getCodonTable().
Referenced by findAllORFIndex(), mRNAModel::mRNAModel(), mRNAModel::resetProtein(), and JGIModel::valid().
1.5.6