NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
sequence_bam.cpp
Go to the documentation of this file.
1 /*
2  * nvbio
3  * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #include <stdlib.h>
29 #include <string.h>
30 #include <zlib/zlib.h>
31 
32 #include <nvbio/basic/console.h>
37 
38 namespace nvbio {
39 namespace io {
40 
43 
46 
49 
51  const char* read_file_name,
52  const SequenceDataFile::Options& options)
53  : SequenceDataFile( options )
54 {
55  fp = gzopen(read_file_name, "rb");
56  if (fp == NULL)
57  {
58  // this will cause init() to fail below
59  log_error(stderr, "unable to open BAM file %s\n", read_file_name);
61  } else {
63  }
64 }
65 
66 bool SequenceDataFile_BAM::readData(void *output, unsigned int len)
67 {
68  unsigned int ret;
69 
70  ret = gzread(fp, output, len);
71  if (ret > 0)
72  {
73  return true;
74  } else {
75  // check for EOF separately; zlib will not always return Z_STREAM_END at EOF below
76  if (gzeof(fp))
77  {
79  } else {
80  // ask zlib what happened and inform the user
81  int err;
82  const char *msg;
83 
84  msg = gzerror(fp, &err);
85  // we're making the assumption that we never see Z_STREAM_END here
86  assert(err != Z_STREAM_END);
87 
88  log_error(stderr, "error processing BAM file: zlib error %d (%s)\n", err, msg);
90  }
91 
92  return false;
93  }
94 }
95 
96 // read in a structure field from fp
97 // returns error (local variable of the right type) if read fails
98 #define GZREAD(field) \
99  if (readData(&(field), sizeof(field)) == false) { \
100  return error; \
101  }
102 
103 // skip bytes in fp
104 // note that gzseek won't actually detect EOF, so we don't check return values here
105 #define GZFWD(bytes) \
106  gzseek(fp, (bytes), SEEK_CUR)
107 
108 // skip a structure field in fp
109 #define GZSKIP(field) \
110  gzseek(fp, sizeof(field), SEEK_CUR)
111 
113 {
114  // error constant for GZREAD
115  static const bool error = false;
116 
118  int c;
119 
120  if (fp == NULL)
121  {
122  // file failed to open
123  return false;
124  }
125 
126  // parse the BAM header
127  GZREAD(header.magic);
128 
129  if (header.magic[0] != 'B' ||
130  header.magic[1] != 'A' ||
131  header.magic[2] != 'M' ||
132  header.magic[3] != '\1')
133  {
134  log_error(stderr, "error parsing BAM file (invalid magic)\n");
136  return false;
137  }
138 
139  // read in header text length and skip header text
140  GZREAD(header.l_text);
141  GZFWD(header.l_text);
142 
143  // skip reference sequence data
144  GZREAD(header.n_ref);
145  for(c = 0; c < header.n_ref; c++)
146  {
147  BAM_reference ref;
148  GZREAD(ref.l_name);
149  GZFWD(ref.l_name + sizeof(ref.l_ref));
150  }
151 
152  return true;
153 }
154 
155 namespace {
156 
157 // decode a BAM bp into ascii
158 inline unsigned char decode_BAM_bp(uint8 bp)
159 {
160  static const char table[] = "=ACMGRSVTWYHKDBN";
161 
162  assert(bp < 16);
163  return table[bp];
164 }
165 
166 }
167 
168 // rewind
169 //
171 {
172  if (fp == NULL)
173  return false;
174 
175  gzrewind( fp );
176 
178  return init();
179 }
180 
181 // grab the next chunk of reads from the file, up to max_reads
183 {
184  if (max_bps < SequenceDataFile::LONG_READ)
185  return 0;
186 
187  // error code for gzread
188  static const int error = -1;
189 
190  // small utility structure to keep track of data buffers and free them as appropriate
191  // (note the slightly evil initializer that sets all pointers to NULL)
192  struct data
193  {
194  char *read_name;
195  uint8 *encoded_read;
196  uint8 *decoded_read;
197  uint8 *quality;
198 
199  ~data()
200  {
201  if (read_name)
202  free(read_name);
203 
204  if (encoded_read)
205  free(encoded_read);
206 
207  if (decoded_read)
208  free(decoded_read);
209 
210  if (quality)
211  free(quality);
212  }
213  } data = { };
214 
215  // utility structure to keep track of alignment header data
217 
218  z_off_t read_block_start;
219  int read_name_len;
220  int read_flags;
221  int cigar_len;
222  int encoded_read_len;
223 
224  int c;
225 
226  // are we done?
227  if (gzeof(fp))
228  {
230  return 0;
231  }
232 
233  // parse and skip all non-primary reads
234  do {
235  // read in the block_size
236  GZREAD(align.block_size);
237 
238  // record the starting file position for this read block
239  read_block_start = gztell(fp);
240 
241  // skip uninsteresting fields
242  GZFWD(sizeof(align.refID) +
243  sizeof(align.pos));
244 
245  GZREAD(align.bin_mq_nl);
246  GZREAD(align.flag_nc);
247 
248  // compute read flags
249  read_flags = align.flag_nc >> 16;
250  if (read_flags & SAMFlag_SecondaryAlignment)
251  {
252  // we're not interested in this read; skip the remainder of the read block and loop
253  uint32 skip = align.block_size - (gztell(fp) - read_block_start);
254  assert(skip);
255  GZFWD(skip);
256 
257  continue;
258  }
259  } while (read_flags & SAMFlag_SecondaryAlignment);
260 
261  // this is a primary read, so read the read
262  GZREAD(align.l_seq);
263  GZFWD(sizeof(align.next_refID) +
264  sizeof(align.next_pos) +
265  sizeof(align.tlen));
266 
267  // read in the name (and add a null-terminator just in case)
268  read_name_len = align.bin_mq_nl & 0xff;
269  data.read_name = (char *) malloc(read_name_len + 1);
270  data.read_name[read_name_len] = 0;
271 
272  if (gzread(fp, data.read_name, read_name_len) != read_name_len)
273  {
274  log_error(stderr, "error processing BAM file (could not fetch read name)\n");
276  return 0;
277  }
278 
279  // skip the cigar
280  cigar_len = (align.flag_nc & 0xffff) * sizeof(uint32);
281  GZFWD(cigar_len);
282 
283  // grab the read data
284  encoded_read_len = (align.l_seq + 1) / 2;
285  data.encoded_read = (uint8 *) malloc((align.l_seq + 1) / 2);
286 
287  if (gzread(fp, data.encoded_read, encoded_read_len) != encoded_read_len)
288  {
289  log_error(stderr, "error processing BAM file (could not fetch sequence data)\n");
291  return 0;
292  }
293 
294  // read in the quality data
295  data.quality = (uint8 *) malloc(align.l_seq);
296  if (gzread(fp, data.quality, align.l_seq) != align.l_seq)
297  {
298  log_error(stderr, "error processing BAM file (could not fetch quality data)\n");
300  return 0;
301  }
302 
303  // skip the rest of the read block
304  uint32 skip = align.block_size - (gztell(fp) - read_block_start);
305  GZFWD(skip);
306 
307  // decode the read data into a null-terminated string
308  data.decoded_read = (uint8 *) malloc(align.l_seq + 1);
309  data.decoded_read[align.l_seq] = 0;
310 
311  PackedStream<uint8*, uint8, 4, true> stream(data.encoded_read);
312  for(c = 0; c < align.l_seq; c++)
313  {
314  data.decoded_read[c] = decode_BAM_bp(stream[c]);
315  }
316 
317  if (m_options.flags & FORWARD)
318  {
321 
322  // add the read into the batch
323  output->push_back(align.l_seq,
324  data.read_name,
325  data.decoded_read,
326  data.quality,
327  Phred,
331  op );
332  }
333  if (m_options.flags & REVERSE)
334  {
337 
338  output->push_back(align.l_seq,
339  data.read_name,
340  data.decoded_read,
341  data.quality,
342  Phred,
346  op );
347  }
349  {
352 
353  output->push_back(align.l_seq,
354  data.read_name,
355  data.decoded_read,
356  data.quality,
357  Phred,
361  op );
362  }
364  {
367 
368  output->push_back(align.l_seq,
369  data.read_name,
370  data.decoded_read,
371  data.quality,
372  Phred,
376  op );
377  }
378 
379  // we always return 1 read at a time
380  return 1;
381 }
382 
386 
387 } // namespace io
388 } // namespace nvbio