NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
bam.cpp
Go to the documentation of this file.
1 /*
2  * nvbio
3  * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #include <stdlib.h>
29 #include <string.h>
30 #include <zlib/zlib.h>
31 
32 #include <nvbio/basic/console.h>
34 #include <nvbio/io/reads/bam.h>
35 #include <nvbio/io/reads/sam.h>
36 
37 namespace nvbio {
38 namespace io {
39 
42 
45 
48 
49 ReadDataFile_BAM::ReadDataFile_BAM(const char *read_file_name,
50  const uint32 max_reads,
51  const uint32 truncate_read_len,
52  const ReadEncoding flags)
53  : ReadDataFile(max_reads, truncate_read_len, flags)
54 {
55  fp = gzopen(read_file_name, "rb");
56  if (fp == NULL)
57  {
58  // this will cause init() to fail below
59  log_error(stderr, "unable to open BAM file %s\n", read_file_name);
61  } else {
63  }
64 }
65 
66 bool ReadDataFile_BAM::readData(void *output, unsigned int len)
67 {
68  unsigned int ret;
69 
70  ret = gzread(fp, output, len);
71  if (ret > 0)
72  {
73  return true;
74  } else {
75  // check for EOF separately; zlib will not always return Z_STREAM_END at EOF below
76  if (gzeof(fp))
77  {
79  } else {
80  // ask zlib what happened and inform the user
81  int err;
82  const char *msg;
83 
84  msg = gzerror(fp, &err);
85  // we're making the assumption that we never see Z_STREAM_END here
86  assert(err != Z_STREAM_END);
87 
88  log_error(stderr, "error processing BAM file: zlib error %d (%s)\n", err, msg);
90  }
91 
92  return false;
93  }
94 }
95 
96 // read in a structure field from fp
97 // returns error (local variable of the right type) if read fails
98 #define GZREAD(field) \
99  if (readData(&(field), sizeof(field)) == false) { \
100  return error; \
101  }
102 
103 // skip bytes in fp
104 // note that gzseek won't actually detect EOF, so we don't check return values here
105 #define GZFWD(bytes) \
106  gzseek(fp, (bytes), SEEK_CUR)
107 
108 // skip a structure field in fp
109 #define GZSKIP(field) \
110  gzseek(fp, sizeof(field), SEEK_CUR)
111 
113 {
114  // error constant for GZREAD
115  static const bool error = false;
116 
118  int c;
119 
120  if (fp == NULL)
121  {
122  // file failed to open
123  return false;
124  }
125 
126  // parse the BAM header
127  GZREAD(header.magic);
128 
129  if (header.magic[0] != 'B' ||
130  header.magic[1] != 'A' ||
131  header.magic[2] != 'M' ||
132  header.magic[3] != '\1')
133  {
134  log_error(stderr, "error parsing BAM file (invalid magic)\n");
136  return false;
137  }
138 
139  // read in header text length and skip header text
140  GZREAD(header.l_text);
141  GZFWD(header.l_text);
142 
143  // skip reference sequence data
144  GZREAD(header.n_ref);
145  for(c = 0; c < header.n_ref; c++)
146  {
147  BAM_reference ref;
148  GZREAD(ref.l_name);
149  GZFWD(ref.l_name + sizeof(ref.l_ref));
150  }
151 
152  return true;
153 }
154 
155 namespace {
156 
157 // decode a BAM bp into ascii
158 inline unsigned char decode_BAM_bp(uint8 bp)
159 {
160  static const char table[] = "=ACMGRSVTWYHKDBN";
161 
162  assert(bp < 16);
163  return table[bp];
164 }
165 
166 }
167 
168 // grab the next chunk of reads from the file, up to max_reads
169 int ReadDataFile_BAM::nextChunk(ReadDataRAM *output, uint32 max_reads, uint32 max_bps)
170 {
171  if (max_bps < ReadDataFile::LONG_READ)
172  return 0;
173 
174  // error code for gzread
175  static const int error = -1;
176 
177  // small utility structure to keep track of data buffers and free them as appropriate
178  // (note the slightly evil initializer that sets all pointers to NULL)
179  struct data
180  {
181  char *read_name;
182  uint8 *encoded_read;
183  uint8 *decoded_read;
184  uint8 *quality;
185 
186  ~data()
187  {
188  if (read_name)
189  free(read_name);
190 
191  if (encoded_read)
192  free(encoded_read);
193 
194  if (decoded_read)
195  free(decoded_read);
196 
197  if (quality)
198  free(quality);
199  }
200  } data = { };
201 
202  // utility structure to keep track of alignment header data
204 
205  z_off_t read_block_start;
206  int read_name_len;
207  int read_flags;
208  int cigar_len;
209  int encoded_read_len;
210 
211  int c;
212 
213  // are we done?
214  if (gzeof(fp))
215  {
217  return 0;
218  }
219 
220  // parse and skip all non-primary reads
221  do {
222  // read in the block_size
223  GZREAD(align.block_size);
224 
225  // record the starting file position for this read block
226  read_block_start = gztell(fp);
227 
228  // skip uninsteresting fields
229  GZFWD(sizeof(align.refID) +
230  sizeof(align.pos));
231 
232  GZREAD(align.bin_mq_nl);
233  GZREAD(align.flag_nc);
234 
235  // compute read flags
236  read_flags = align.flag_nc >> 16;
237  if (read_flags & SAMFlag_SecondaryAlignment)
238  {
239  // we're not interested in this read; skip the remainder of the read block and loop
240  uint32 skip = align.block_size - (gztell(fp) - read_block_start);
241  assert(skip);
242  GZFWD(skip);
243 
244  continue;
245  }
246  } while (read_flags & SAMFlag_SecondaryAlignment);
247 
248  // this is a primary read, so read the read
249  GZREAD(align.l_seq);
250  GZFWD(sizeof(align.next_refID) +
251  sizeof(align.next_pos) +
252  sizeof(align.tlen));
253 
254  // read in the name (and add a null-terminator just in case)
255  read_name_len = align.bin_mq_nl & 0xff;
256  data.read_name = (char *) malloc(read_name_len + 1);
257  data.read_name[read_name_len] = 0;
258 
259  if (gzread(fp, data.read_name, read_name_len) != read_name_len)
260  {
261  log_error(stderr, "error processing BAM file (could not fetch read name)\n");
263  return 0;
264  }
265 
266  // skip the cigar
267  cigar_len = (align.flag_nc & 0xffff) * sizeof(uint32);
268  GZFWD(cigar_len);
269 
270  // grab the read data
271  encoded_read_len = (align.l_seq + 1) / 2;
272  data.encoded_read = (uint8 *) malloc((align.l_seq + 1) / 2);
273 
274  if (gzread(fp, data.encoded_read, encoded_read_len) != encoded_read_len)
275  {
276  log_error(stderr, "error processing BAM file (could not fetch sequence data)\n");
278  return 0;
279  }
280 
281  // read in the quality data
282  data.quality = (uint8 *) malloc(align.l_seq);
283  if (gzread(fp, data.quality, align.l_seq) != align.l_seq)
284  {
285  log_error(stderr, "error processing BAM file (could not fetch quality data)\n");
287  return 0;
288  }
289 
290  // skip the rest of the read block
291  uint32 skip = align.block_size - (gztell(fp) - read_block_start);
292  GZFWD(skip);
293 
294  // decode the read data into a null-terminated string
295  data.decoded_read = (uint8 *) malloc(align.l_seq + 1);
296  data.decoded_read[align.l_seq] = 0;
297 
298  PackedStream<uint8*, uint8, 4, true> stream(data.encoded_read);
299  for(c = 0; c < align.l_seq; c++)
300  {
301  data.decoded_read[c] = decode_BAM_bp(stream[c]);
302  }
303 
304  if (m_flags & FORWARD)
305  {
306  const ReadDataRAM::StrandOp op = (read_flags & SAMFlag_ReverseComplemented) ?
308 
309  // add the read into the batch
310  output->push_back(align.l_seq,
311  data.read_name,
312  data.decoded_read,
313  data.quality,
314  Phred,
316  op );
317  }
318  if (m_flags & REVERSE)
319  {
320  const ReadDataRAM::StrandOp op = (read_flags & SAMFlag_ReverseComplemented) ?
322 
323  output->push_back(align.l_seq,
324  data.read_name,
325  data.decoded_read,
326  data.quality,
327  Phred,
329  op );
330  }
332  {
333  const ReadDataRAM::StrandOp op = (read_flags & SAMFlag_ReverseComplemented) ?
335 
336  output->push_back(align.l_seq,
337  data.read_name,
338  data.decoded_read,
339  data.quality,
340  Phred,
342  op );
343  }
345  {
346  const ReadDataRAM::StrandOp op = (read_flags & SAMFlag_ReverseComplemented) ?
348 
349  output->push_back(align.l_seq,
350  data.read_name,
351  data.decoded_read,
352  data.quality,
353  Phred,
355  op );
356  }
357 
358  // we always return 1 read at a time
359  return 1;
360 }
361 
365 
366 } // namespace io
367 } // namespace nvbio