|
testTests.cppGo to the documentation of this file.00001 #include "gn/gnSequence.h" 00002 #include "gn/gnGBKSource.h" 00003 #include "gn/gnFASSource.h" 00004 #include <algorithm> 00005 00006 static boolean LocationLessthan(const gnLocation& a, const gnLocation& b); 00007 00008 static boolean LocationLessthan(const gnLocation& a, const gnLocation& b){ 00009 return a.GetStart() < b.GetStart(); 00010 } 00011 00012 static boolean LocationEndLessthan(const gnLocation& a, const gnLocation& b); 00013 00014 static boolean LocationEndLessthan(const gnLocation& a, const gnLocation& b){ 00015 return a.GetEnd() < b.GetStart(); 00016 } 00017 00018 static boolean LocationSizeLessthan(const gnLocation& a, const gnLocation& b); 00019 00020 static boolean LocationSizeLessthan(const gnLocation& a, const gnLocation& b){ 00021 return a.GetEnd() - a.GetStart() < b.GetEnd() - b.GetStart(); 00022 } 00023 00024 // create a location list which contains every intervening region 00025 // in l_list 00026 void ComplementLocation(vector<gnLocation>& l_list){ 00027 if(l_list.size() < 2) 00028 return; //need 2 or more locations to get the in between regions 00029 00030 vector<gnLocation> comp_list; 00031 00032 sort(l_list.begin(), l_list.end(), &LocationLessthan); 00033 00034 for(uint32 locationI = 1; locationI < l_list.size(); locationI++){ 00035 00036 //if they don't intersect there is an intervening region 00037 if( !l_list[locationI].Intersects( l_list[locationI-1] ) ){ 00038 00039 //create the region between locationI and locationI - 1 00040 gnLocation between_loc; 00041 between_loc.SetStart(l_list[locationI-1].GetEnd()); 00042 between_loc.SetEnd(l_list[locationI].GetStart()); 00043 //add it to the complement list 00044 comp_list.push_back(between_loc); 00045 } 00046 00047 } 00048 00049 l_list = comp_list; 00050 } 00051 00052 void DeleteRepeats(list<gnLocation>& loc_list){ 00053 00054 loc_list.sort(&LocationLessthan); 00055 00056 list<gnLocation>::iterator loc_iter = loc_list.begin(); 00057 list<gnLocation>::iterator cur_loc; 00058 for(; loc_iter != loc_list.end(); loc_iter++){ 00059 cur_loc = loc_iter; 00060 cur_loc++; 00061 while(cur_loc != loc_list.end() && !LocationEndLessthan(*loc_iter, *cur_loc)){ 00062 if(loc_iter->Intersects(*cur_loc)){ 00063 *loc_iter = loc_iter->GetUnion(*cur_loc); 00064 list<gnLocation>::iterator to_del = cur_loc; 00065 cur_loc--; 00066 loc_list.erase(to_del); 00067 } 00068 cur_loc++; 00069 } 00070 } 00071 } 00072 00073 void WriteData(gnSequence& file_sequence, list<gnLocation>& loc_list, string& filename){ 00074 gnSequence loc_seq; 00075 cout << "Gathering sequence data to write...\n"; 00076 list<gnLocation>::iterator loc_iter = loc_list.begin(); 00077 uint32 loc_count = loc_list.size(); 00078 uint32 notice_interval = loc_count / 10; 00079 uint32 cur_locI = 0; 00080 for(;loc_iter != loc_list.end(); loc_iter++){ 00081 loc_seq += file_sequence.ToString( loc_iter->GetEnd() - loc_iter->GetStart(), loc_iter->GetStart()); 00082 if(cur_locI % notice_interval == 0){ 00083 cout << (cur_locI / loc_count) * 10 << "%.."; 00084 } 00085 cur_locI++; 00086 } 00087 cout << "\nWriting sequence data...\n"; 00088 gnFASSource::Write(loc_seq, filename); 00089 } 00090 00091 void WriteList(list<gnLocation>& loc_list, string& filename){ 00092 ofstream list_out(filename.c_str()); 00093 list<gnLocation>::iterator loc_iter = loc_list.begin(); 00094 for(;loc_iter != loc_list.end(); loc_iter++){ 00095 list_out << loc_iter->GetStart() << '\t' << loc_iter->GetEnd() << '\n'; 00096 } 00097 list_out.close(); 00098 } 00099 00100 void print_usage(char* pname){ 00101 cout << "Usage: " << pname << " <genbank file> <exon_list> <intron_list> <exon_seq> <intron_seq>\n"; 00102 } 00103 00104 int main( int argc, char* argv[] ) 00105 { 00106 boolean run_interactive = false; 00107 string seq_filename; 00108 string exon_list_filename; 00109 string intron_list_filename; 00110 string exon_seq_filename; 00111 string intron_seq_filename; 00112 // define a gnSequence to store the sequence 00113 gnSequence file_sequence; 00114 00115 if(!run_interactive){ 00116 // check for correct calling semantics 00117 if(argc != 6){ 00118 print_usage(argv[0]); 00119 return -1; 00120 } 00121 00122 seq_filename = argv[1]; 00123 exon_list_filename = argv[2]; 00124 intron_list_filename = argv[3]; 00125 exon_seq_filename = argv[4]; 00126 intron_seq_filename = argv[5]; 00127 }else{ 00128 00129 // Get the name of the sequence to load 00130 cout << "Enter the name of the sequence file to load:\n" ; 00131 cin >> seq_filename; 00132 } 00133 00134 // Load the sequence and tell the user if it loaded successfully 00135 if(file_sequence.LoadSource(seq_filename)) 00136 { 00137 cout << seq_filename << " loaded successfully. " << file_sequence.length() << " base pairs.\n"; 00138 }else{ 00139 cout << "Error loading file.\n"; 00140 return -1; 00141 } 00142 00143 uint32 feature_count = file_sequence.getFeatureListLength(); 00144 uint32 cds_count = 0; 00145 00146 //define a sequence for each type of data 00147 gnSequence exon_seq; 00148 gnSequence intron_seq; 00149 00150 list<gnLocation> exon_list; 00151 list<gnLocation> intron_list; 00152 00153 cout << "There are " << feature_count << " known features.\n"; 00154 for(uint32 featureI = 0; featureI < feature_count; featureI++){ 00155 gnBaseFeature* cur_feat = file_sequence.getFeature(featureI); 00156 if(cur_feat == NULL){ 00157 cout << "cur_feat is NULL, trace me!\n"; 00158 cur_feat = file_sequence.getFeature(featureI); 00159 continue; 00160 } 00161 if(cur_feat->GetName() == "CDS"){ 00162 cds_count++; 00163 uint32 loc_count = cur_feat->GetLocationListLength(); 00164 vector<gnLocation> cur_exon_list, cur_intron_list; 00165 00166 for(uint32 locationI = 0; locationI < loc_count; locationI++){ 00167 cur_exon_list.push_back(cur_feat->GetLocation(locationI)); 00168 exon_list.push_back(cur_exon_list[locationI]); 00169 } 00170 if(loc_count >= 2){ 00171 cur_intron_list = cur_exon_list; 00172 ComplementLocation(cur_intron_list); 00173 for(uint32 locationI = 0; locationI < cur_intron_list.size(); locationI++) 00174 intron_list.push_back(cur_intron_list[locationI]); 00175 } 00176 /* Code for exon size tracking */ 00177 /* sort(cur_exon_list.begin(), cur_exon_list.end(), &LocationSizeLessthan); 00178 gnSeqI cur_size = cur_exon_list[loc_count - 1].GetEnd() - cur_exon_list[loc_count - 1].GetStart(); 00179 size_out << cur_size << '\n'; 00180 if(cur_size > 5000){ 00181 cout << "psycho bob is at it again\n"; 00182 cout << "Found " << cur_size << " length exon in feature: " << cur_feat->GetName() << "\n"; 00183 cout << "Feature Index: " << featureI << "\n"; 00184 for(uint32 i=0; i < cur_feat->GetQualifierListLength(); i++){ 00185 cout << cur_feat->GetQualifierName(i) << "\t" << cur_feat->GetQualifierValue(i) << "\n"; 00186 } 00187 cout << "\n"; 00188 } */ 00189 delete cur_feat; 00190 } 00191 } 00192 00193 cout << "There are " << cds_count << " cds features.\n"; 00194 00195 //now sort the lists and delete repeats and fix overlaps 00196 uint32 exon_count = exon_list.size(); 00197 cout << "Deleting exon repeats... "; 00198 DeleteRepeats(exon_list); 00199 cout << "Eliminated " << exon_count - exon_list.size() << " repeats for a total of "; 00200 cout << exon_list.size() << " repeats\n"; 00201 uint32 intron_count = intron_list.size(); 00202 cout << "Deleting intron repeats... "; 00203 DeleteRepeats(intron_list); 00204 cout << "Eliminated " << intron_count - intron_list.size() << " repeats for a total of "; 00205 cout << intron_list.size() << " repeats\n"; 00206 WriteList(exon_list, exon_list_filename); 00207 WriteList(intron_list, intron_list_filename); 00208 WriteData(file_sequence, exon_list, exon_seq_filename); 00209 WriteData(file_sequence, intron_list, intron_seq_filename); 00210 if(run_interactive) 00211 cin >> seq_filename; 00212 return 0; 00213 }; Generated at Fri Nov 30 15:36:52 2001 for libGenome by 1.2.8.1 written by Dimitri van Heesch, © 1997-2001 |