#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#include <set>
#include <cstdlib>
#include <cstring>
#include <cassert>
#include "strformat.h"
#include <boost/math/distributions/chi_squared.hpp>
Classes | |
| class | lesschptr |
Functions | |
| void | readIdList (set< string > &ids, const string &file) |
| void | readIdList (set< const char *, lesschptr > &ids, const string &file) |
| void | readIdFromFasta (set< const char *, lesschptr > &ids, const string &file) |
| vector< int > | processModels (const string &modfile, const vector< set< const char *, lesschptr > > &estlibs, const vector< string > &libfiles, vector< int * > &result, ostream &log) |
| int | hitCount (const string &ids, const vector< set< const char *, lesschptr > > &estlib, vector< set< const char *, lesschptr > > &mapped, int *&counts) |
| int | countESTIDs (const string &modfile) |
| void | usage () |
| void | releaseMemory (vector< set< const char *, lesschptr > > &estlibs) |
| void | releaseResult (vector< int * > &res) |
| void | ChisquareAndRatio (vector< int * > &counts, vector< int > &overallcount, ostream &ous, ostream &RAE, ostream &log) |
| void | outputHeader (ostream &ous, const vector< string > &libfiles) |
| string | nameraefile (const string &countfile) |
| int | main (int argc, char *argv[]) |
| void ChisquareAndRatio | ( | vector< int * > & | counts, | |
| vector< int > & | overallcount, | |||
| ostream & | ous, | |||
| ostream & | RAE, | |||
| ostream & | log | |||
| ) |
| int countESTIDs | ( | const string & | modfile | ) |
Since one EST can be assembled into multiple models, a unique list has to be produced from the input models.
References ifstream().
Referenced by main().
| int hitCount | ( | const string & | ids, | |
| const vector< set< const char *, lesschptr > > & | estlib, | |||
| vector< set< const char *, lesschptr > > & | mapped, | |||
| int *& | counts | |||
| ) |
| counts | store the number of hits in each library |
| ids | is the id string from each combest model result. Ids are separated by commas, e.g. "FYT9WAP01BTJGL,FYT9WAP01CKLAO,FYT9WAP01DZQZM". This is the input. This function will simply convert this list into a count for each input library as given by estlib and mapped. | |
| estlib | a vector of sets of ESTids, one set from each library. This is the other input for this function. It contains the ESTids from each input library. Not all of the ESTs may be mapped to the genome, so some will not be included in any model. | |
| mapped | this is a result accumulator that contains the exact ESTids that are mapped from each EST library. There is a one-to-one library correspondence between this and estlib. They should have the same size unless, for some reason, none of the ESTs from one input estlib maps. | |
| counts | is the output result for this particular model. This result vector has one more element than estlib; the last element is used for holding the modelid. So if you have 10 EST ids, after this function call counts could be |3|0|5|2|formodid| with four input libraries. |
References find(), and split().
Referenced by processModels().
| int main | ( | int | argc, | |
| char * | argv[] | |||
| ) |
| string nameraefile | ( | const string & | countfile | ) |
Referenced by main().
| void outputHeader | ( | ostream & | ous, | |
| const vector< string > & | libfiles | |||
| ) |
| vector< int > processModels | ( | const string & | modfile, | |
| const vector< set< const char *, lesschptr > > & | estlibs, | |||
| const vector< string > & | libfiles, | |||
| vector< int * > & | result, | |||
| ostream & | log | |||
| ) |
pointer version for production. This is more efficient. This is the main function for doing the main job.
| modfile | the combest output file name. This function will only use the ESTids column. | |
| estlibs | is the input est library ESTids preloaded. | |
| libfiles | the names of the library files that contain lists of ESTids. This is only used for output headers for the result file. | |
| result | is a vector of mapping result for each input model. The last column is modelid. |
References hitCount(), and ifstream().
Referenced by main().
| void readIdFromFasta | ( | set< const char *, lesschptr > & | ids, | |
| const string & | file | |||
| ) |
| void readIdList | ( | set< const char *, lesschptr > & | ids, | |
| const string & | file | |||
| ) |
This is the memory-efficient and faster version. It is given a file of ESTids. The format of the file is one or more ESTids separated by one or more whitespace characters (tab or space). This function will store the result into the set.
| file | is usually named after the library that contains all ESTids from one condition. | |
| ids | container to hold the result. The result pointer memory needs to be freed with delete. |
References ifstream().
| void readIdList | ( | set< string > & | ids, | |
| const string & | file | |||
| ) |
Older version using set<string>; it takes too much memory and has been replaced with the pointer version set<char*, lesschptr>. This function will get rid of redundant entries. Id file formats: 1. >id, one or more per line; 2. id1 id2 ...; 3. fasta file.
References ifstream().
Referenced by main().
| void releaseMemory | ( | vector< set< const char *, lesschptr > > & | estlibs | ) |
Referenced by clustergene(), groupModel(), and main().
| void releaseResult | ( | vector< int * > & | res | ) |
Referenced by main().
| void usage | ( | ) |
given gmap summary format, this program converts it into combest archive format (*.car)
It can be used as a pipe, or given specific file names.
this is a helper program to count distinct estids in the ESTId column of the combest result. So it is the actual number of ESTs mapped. This could be lower for deeply covered genome because of the coverage depth-dependent filtering.
given gmap summary format, this program converts it into combest archive format (*.car)
It can be used as a pipe, or given specific file names.
this is a helper program to count distinct estids in the ESTId column of the combest result. So it is the actual number of ESTs mapped. This could be lower for deeply covered genome because of the coverage depth-dependent filtering.
1.5.6