NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
bnt.cpp
Go to the documentation of this file.
1 /*
2  * nvbio
3  * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #include <nvbio/basic/bnt.h>
29 
30 namespace nvbio {
31 
32 //
33 // NOTE: the code below is a derivative of bntseq.h, originally distributed
34 // under the MIT License, Copyright (c) 2008 Genome Research Ltd (GRL).
35 //
36 
37 void save_bns(const BNTSeq& bns, const char *prefix)
38 {
39  //
40  // save the BNT sequence information to two distinct files,
41  // one for annotations (.ann) and one for amiguities (.amb).
42  // The file formats are taken from BWA 0.6.1.
43  //
44 
45  { // dump .ann
46  std::string filename = std::string( prefix ) + ".ann";
47  FILE* file = fopen( filename.c_str(), "w" );
48  if (file == NULL)
49  throw bns_fopen_failure();
50 
51  fprintf( file, "%lld %d %u\n", (long long)bns.l_pac, bns.n_seqs, bns.seed );
52 
53  for (int32 i = 0; i != bns.n_seqs; ++i)
54  {
55  const BNTAnnData& ann_data = bns.anns_data[i];
56  const BNTAnnInfo& ann_info = bns.anns_info[i];
57 
58  fprintf(file, "%d %s", ann_data.gi, ann_info.name.c_str());
59  if (ann_info.anno.length())
60  fprintf(file, " %s\n", ann_info.anno.c_str());
61  else
62  fprintf(file, "\n");
63 
64  fprintf( file, "%lld %d %d\n", (long long)ann_data.offset, ann_data.len, ann_data.n_ambs );
65  }
66  fclose( file );
67  }
68  { // dump .amb
69  std::string filename = std::string( prefix ) + ".amb";
70  FILE* file = fopen( filename.c_str(), "w" );
71  if (file == NULL)
72  throw bns_fopen_failure();
73 
74  fprintf( file, "%lld %d %u\n", (long long)bns.l_pac, bns.n_seqs, bns.n_holes );
75  for (int32 i = 0; i != bns.n_holes; ++i)
76  {
77  const BNTAmb& amb = bns.ambs[i];
78  fprintf( file, "%lld %d %c\n", (long long)amb.offset, amb.len, amb.amb );
79  }
80  fclose( file );
81  }
82 }
83 void load_bns(BNTSeq& bns, const char *prefix)
84 {
85  //
86  // load the BNT sequence information from two distinct files,
87  // one for annotations (.ann) and one for amiguities (.amb).
88  // The file formats are taken from BWA 0.6.1.
89  //
90 
91  { // read .ann
92  std::string filename = std::string( prefix ) + ".ann";
93  FILE* file = fopen( filename.c_str(), "r" );
94  if (file == NULL)
95  throw bns_fopen_failure();
96 
97  fscanf( file, "%lld%d%u\n", &bns.l_pac, &bns.n_seqs, &bns.seed );
98 
99  bns.anns_data.resize( bns.n_seqs );
100  bns.anns_info.resize( bns.n_seqs );
101 
102  for (int32 i = 0; i != bns.n_seqs; ++i)
103  {
104  BNTAnnData& ann_data = bns.anns_data[i];
105  BNTAnnInfo& ann_info = bns.anns_info[i];
106 
107  char buffer[2048];
108 
109  // read gi and name
110  fscanf( file, "%d%s", &ann_data.gi, buffer );
111 
112  ann_info.name = buffer;
113 
114  // read fasta comments
115  {
116  char* p = buffer;
117  char c;
118  while ((c = fgetc( file )) != '\n' && c != EOF)
119  *p++ = c;
120 
121  *p = '\0';
122 
123  // skip leading spaces
124  for (p = buffer; *p == ' '; ++p) {}
125 
126  ann_info.anno = p;
127  }
128 
129  fscanf(file, "%lld%d%d\n", &ann_data.offset, &ann_data.len, &ann_data.n_ambs);
130  }
131  fclose( file );
132  }
133  { // read .amb
134  std::string filename = std::string( prefix ) + ".amb";
135  FILE* file = fopen( filename.c_str(), "r" );
136  if (file == NULL)
137  throw bns_fopen_failure();
138 
139  try
140  {
141  int64 l_pac;
142  int32 n_seqs;
143  fscanf( file, "%lld%d%u\n", &l_pac, &n_seqs, &bns.n_holes );
144 
145  if (l_pac != bns.l_pac || n_seqs != bns.n_seqs)
146  throw bns_files_mismatch();
147 
148  bns.ambs.resize( bns.n_holes );
149 
150  for (int32 i = 0; i != bns.n_holes; ++i)
151  {
152  BNTAmb& amb = bns.ambs[i];
153  fscanf( file, "%lld%d%c\n", &amb.offset, &amb.len, &amb.amb );
154  }
155  fclose( file );
156  }
157  catch (...)
158  {
159  fclose( file );
160  throw;
161  }
162  }
163 }
164 
165 void load_bns_info(BNTInfo& bns, const char *prefix)
166 {
167  { // read .ann
168  std::string filename = std::string( prefix ) + ".ann";
169  FILE* file = fopen( filename.c_str(), "r" );
170  if (file == NULL)
171  throw bns_fopen_failure();
172 
173  fscanf( file, "%lld%d%u\n", &bns.l_pac, &bns.n_seqs, &bns.seed );
174  }
175  { // read .amb
176  std::string filename = std::string( prefix ) + ".amb";
177  FILE* file = fopen( filename.c_str(), "r" );
178  if (file == NULL)
179  throw bns_fopen_failure();
180 
181  try
182  {
183  int64 l_pac;
184  int32 n_seqs;
185  fscanf( file, "%lld%d%u\n", &l_pac, &n_seqs, &bns.n_holes );
186 
187  if (l_pac != bns.l_pac || n_seqs != bns.n_seqs)
188  throw bns_files_mismatch();
189  }
190  catch (...)
191  {
192  fclose( file );
193  throw;
194  }
195  }
196 }
197 void load_bns(BNTSeqLoader* bns, const char *prefix)
198 {
199  //
200  // load the BNT sequence information from two distinct files,
201  // one for annotations (.ann) and one for amiguities (.amb).
202  // The file formats are taken from BWA 0.6.1.
203  //
204 
205  BNTInfo info;
206 
207  load_bns_info( info, prefix );
208 
209  bns->set_info( info );
210 
211  { // read .ann
212  std::string filename = std::string( prefix ) + ".ann";
213  FILE* file = fopen( filename.c_str(), "r" );
214  if (file == NULL)
215  throw bns_fopen_failure();
216 
217  int64 l_pac;
218  int32 n_seqs;
219  uint32 seed;
220  fscanf( file, "%lld%d%u\n", &l_pac, &n_seqs, &seed );
221 
222  BNTAnnData ann_data;
223  BNTAnnInfo ann_info;
224 
225  for (int32 i = 0; i != n_seqs; ++i)
226  {
227  char buffer[2048];
228 
229  // read gi and name
230  fscanf( file, "%d%s", &ann_data.gi, buffer );
231 
232  ann_info.name = buffer;
233 
234  // read fasta comments
235  {
236  char* p = buffer;
237  char c;
238  while ((c = fgetc( file )) != '\n' && c != EOF)
239  *p++ = c;
240 
241  *p = '\0';
242 
243  // skip leading spaces
244  for (p = buffer; *p == ' '; ++p) {}
245 
246  ann_info.anno = p;
247  }
248 
249  fscanf(file, "%lld%d%d\n", &ann_data.offset, &ann_data.len, &ann_data.n_ambs);
250  bns->read_ann( ann_info, ann_data );
251  }
252  fclose( file );
253  }
254  { // read .amb
255  std::string filename = std::string( prefix ) + ".amb";
256  FILE* file = fopen( filename.c_str(), "r" );
257  if (file == NULL)
258  throw bns_fopen_failure();
259 
260  try
261  {
262  int64 l_pac;
263  int32 n_seqs;
264  int32 n_holes;
265  fscanf( file, "%lld%d%u\n", &l_pac, &n_seqs, &n_holes );
266 
267  BNTAmb amb;
268  for (int32 i = 0; i != n_holes; ++i)
269  {
270  fscanf( file, "%lld%d%c\n", &amb.offset, &amb.len, &amb.amb );
271  bns->read_amb( amb );
272  }
273  fclose( file );
274  }
275  catch (...)
276  {
277  fclose( file );
278  throw;
279  }
280  }
281 }
282 
283 } // namespace nvbio