// bam_io.h — BAM alignment I/O type and class declarations (nvbio).
/*
 * nvbio
 * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 * * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * * Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 * * Neither the name of the NVIDIA CORPORATION nor the
 * names of its contributors may be used to endorse or promote products
 * derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#pragma once

#include <cstring>
#include <string>
#include <vector>

#include <htslib/sam.h>
#include <htslib/hts.h>
#include <htslib/bgzf.h>

#include "assembly_types.h"

using namespace nvbio;

40 // BAM header
41 struct BAM_header
42 {
43  uint8 magic[4]; // BAM magic string
44  int32 l_text; // length of the header text
45  std::string text; // the header text itself
46  int32 n_ref; // number of reference sequences
47  std::vector<std::string> sq_names; // reference sequence names
48  H_VectorU32 sq_lengths; // reference sequence lengths
49 };
50 
51 // BAM alignment record header
53 {
54  int32 block_size; // length of the remainder of the alignment record
55  int32 refID; // reference sequence ID, -1 <= refID < n_ref (-1 for a read without a mapping position)
56  int32 pos; // 0-based leftmost coordinate
57  uint32 bin_mq_nl; // bin << 16 | MAPQ << 8 | l_read_name
58  uint32 flag_nc; // FLAG << 16 | n_cigar_op
59  int32 l_seq; // length of the sequence
60  int32 next_refID; // refID of the next segment (-1 <= next_refID < n_ref)
61  int32 next_pos; // 0-based leftmost pos of the next segment
62  int32 tlen; // template length
63 
64  uint32 bin(void) const { return bin_mq_nl >> 16; }
65  uint32 mapq(void) const { return (bin_mq_nl & 0xff00) >> 8; }
66  uint32 l_read_name(void) const { return bin_mq_nl & 0xff; }
67  uint32 flags(void) const { return flag_nc >> 16; }
68  uint32 num_cigar_ops(void) const { return flag_nc & 0xffff; }
69 };
70 
71 // BAM alignment record
72 // consists of the alignment header followed by a variable-length
73 // data block; the auxiliary data is of block_size - ftell() bytes
75 {
76  BAM_alignment_header header; // alignment info
77  H_VectorU8 data; // qname + cigar + seq + qual + aux
78 };
79 
80 // contiguous batch of raw alignment records
82 {
83  H_VectorU8 recs; // raw byte array of alignment records
84  H_VectorU64 offsets; // offsets into alignment records (num_recs + 1)
85 };
86 
// Bit-mask selecting which alignment-record fields are loaded/stored;
// values combine with `|`. BAM_ALL additionally enables the mate fields.
// NOTE(review): BAM_CIGARS, BAM_QUALITIES and BAM_POSITIONS were elided by
// the doc scrape; they are reconstructed from the power-of-two gaps and
// their use in BAM_alignment_batch_SoA — confirm against the original.
typedef enum
{
    BAM_NAMES     = 1,   // read names
    BAM_CIGARS    = 2,   // CIGAR strings (reconstructed)
    BAM_READS     = 4,   // read sequences
    BAM_QUALITIES = 8,   // base qualities (reconstructed)
    BAM_FLAGS     = 16,  // alignment flags
    BAM_POSITIONS = 32,  // alignment positions (reconstructed)
    BAM_REFIDS    = 64,  // reference sequence IDs
    BAM_MAPQ      = 128, // mapping qualities
    BAM_AUX       = 256, // auxiliary (tag) data
    BAM_BIN       = 512, // BAI bins
    BAM_ALL       = 0xFFFF
} BAM_alignment_fields; // TODO(review): typedef name elided in scrape — verify

// NOTE(review): a second `typedef enum` appeared here in the original header
// (original lines 102-115), but every one of its enumerator lines was lost in
// the doc scrape. It is commented out rather than guessed at — recover the
// definition from the original bam_io.h.
// typedef enum
// {
// } /* name lost */;

118 // ----- SoA Functionality ------
119 
120 // CRQ: cigars, reads, qualities
122 {
123  uint64 cigar_start, cigar_len;
124  uint64 read_start, read_len;
125  uint64 qual_start, qual_len;
126 
127  BAM_CRQ_index(): cigar_start(0), cigar_len(0), read_start(0), read_len(0), qual_start(0), qual_len(0) { }
128  BAM_CRQ_index(uint64 cigar_start, uint64 cigar_len, uint64 read_start, uint64 read_len, uint64 qual_start, uint64 qual_len)
129  : cigar_start(cigar_start), cigar_len(cigar_len), read_start(read_start), read_len(read_len),
130  qual_start(qual_start), qual_len(qual_len) { }
131 };
132 
134 {
138 
139  BAM_NAUX_index() : aux_data_start(0), aux_data_len(0), name_start(0) { }
140  BAM_NAUX_index(uint64 aux_data_start, uint32 aux_data_len, uint64 name_start)
141  : aux_data_start(aux_data_start), aux_data_len(aux_data_len), name_start(name_start) { }
142 };
143 
// Number of alignment records reserved per host batch.
#define H_BATCH_SIZE_ALLOC 10000000U
// Per-record allocation size in bytes — presumably an average raw-record
// size hint; TODO(review): confirm against the original header.
#define ALNREC_SIZE_ALLOC 512

150 // batch of alignment records as SoA
151 // host-only now
153 {
154  BAM_alignment_batch_SoA(): field_mask(BAM_ALL), num_alns(0) { reserve(); }
155  BAM_alignment_batch_SoA(uint32 fields): field_mask(fields), num_alns(0) { reserve(); }
156 
157  uint32 field_mask; // record fields stored
171  nvbio::vector<host_tag, BAM_CRQ_index> crq_index; // CRQ: cigars, reads, qualities
173 
174  void reserve()
175  {
176  if(field_mask & BAM_POSITIONS) positions.reserve(H_BATCH_SIZE_ALLOC);
177  if(field_mask & BAM_REFIDS) refIDs.reserve(H_BATCH_SIZE_ALLOC);
178  if(field_mask & BAM_BIN) bin.reserve(H_BATCH_SIZE_ALLOC);
179  if(field_mask & BAM_MAPQ) mapq.reserve(H_BATCH_SIZE_ALLOC);
180  if(field_mask & BAM_FLAGS) flags.reserve(H_BATCH_SIZE_ALLOC);
181  if(field_mask & (BAM_CIGARS | BAM_QUALITIES | BAM_READS)) crq_index.reserve(H_BATCH_SIZE_ALLOC);
182  if(field_mask & BAM_CIGARS) cigars.reserve(H_BATCH_SIZE_ALLOC * 32);
183  if(field_mask & BAM_READS) reads.reserve(H_BATCH_SIZE_ALLOC * 125);
184  if(field_mask & BAM_QUALITIES) qualities.reserve(H_BATCH_SIZE_ALLOC * 125);
185  if(field_mask & (BAM_AUX | BAM_NAMES)) aln_index.reserve(H_BATCH_SIZE_ALLOC);
186  if(field_mask & BAM_AUX) aux_data.reserve(H_BATCH_SIZE_ALLOC * 64);
187  if(field_mask & BAM_NAMES) names.reserve(H_BATCH_SIZE_ALLOC * 256);
188  if(field_mask == BAM_ALL) {
189  next_positions.reserve(H_BATCH_SIZE_ALLOC);
190  next_refIDs.reserve(H_BATCH_SIZE_ALLOC);
191  }
192  }
193 
194  void allocate(const BAM_alignment_batch_SoA& size_batch)
195  {
196  if(field_mask & BAM_POSITIONS) positions.resize(size_batch.positions.size());
197  if(field_mask & BAM_REFIDS) refIDs.resize(size_batch.refIDs.size());
198  if(field_mask & BAM_BIN) bin.resize(size_batch.bin.size());
199  if(field_mask & BAM_MAPQ) mapq.resize(size_batch.mapq.size());
200  if(field_mask & BAM_FLAGS) flags.resize(size_batch.flags.size());
201  if(field_mask & (BAM_CIGARS | BAM_QUALITIES | BAM_READS)) crq_index.resize(size_batch.crq_index.size());
202  if(field_mask & BAM_CIGARS) cigars.resize(size_batch.cigars.size());
203  if(field_mask & BAM_READS) reads.resize(size_batch.reads.size());
204  if(field_mask & BAM_QUALITIES) qualities.resize(size_batch.qualities.size());
205  if(field_mask & (BAM_AUX | BAM_NAMES)) aln_index.resize(size_batch.aln_index.size());
206  if(field_mask & BAM_AUX) aux_data.resize(size_batch.aux_data.size());
207  if(field_mask & BAM_NAMES) names.resize(size_batch.names.size());
208  if(field_mask == BAM_ALL) {
209  next_positions.resize(size_batch.next_positions.size());
210  next_refIDs.resize(size_batch.next_refIDs.size());
211  }
212  }
213 
214  void reset()
215  {
216  num_alns = 0;
217  positions.clear();
218  refIDs.clear();
219  bin.clear();
220  mapq.clear();
221  flags.clear();
222  crq_index.clear();
223  cigars.clear();
224  reads.clear();
225  qualities.clear();
226  aln_index.clear();
227  aux_data.clear();
228  names.clear();
229  next_positions.clear();
230  next_refIDs.clear();
231  }
232 
233  void free()
234  {
235  // force the memory to be freed by swapping
236  positions = H_VectorU32();
237  refIDs = H_VectorU32();
238  bin = H_VectorU16();
239  mapq = H_VectorU8();
240  flags = H_VectorU16();
241  cigars = H_VectorCigarOp();
242  reads = H_VectorDNA16();
243  qualities = H_VectorU8();
244  aux_data = H_VectorU8();
245  names = H_VectorU8();
248  next_positions = H_VectorU32();
249  next_refIDs = H_VectorU32();
250  }
251 
252  void shuffle(BAM_alignment_batch_SoA& batch_out, H_VectorU64 shuffle_idx) {
253  batch_out.allocate(*this);
254 
255  if(field_mask & BAM_POSITIONS) {
256  thrust::gather(shuffle_idx.begin(), shuffle_idx.end(), positions.begin(), batch_out.positions.begin());
257  }
258  if(field_mask & BAM_REFIDS) {
259  thrust::gather(shuffle_idx.begin(), shuffle_idx.end(), refIDs.begin(), batch_out.refIDs.begin());
260  }
261  if(field_mask & BAM_BIN) {
262  thrust::gather(shuffle_idx.begin(), shuffle_idx.end(), bin.begin(), batch_out.bin.begin());
263  }
264  if(field_mask & BAM_MAPQ) {
265  thrust::gather(shuffle_idx.begin(), shuffle_idx.end(), mapq.begin(), batch_out.mapq.begin());
266  }
267  if(field_mask & BAM_FLAGS) {
268  thrust::gather(shuffle_idx.begin(), shuffle_idx.end(), flags.begin(), batch_out.flags.begin());
269  }
270  if(field_mask & BAM_QUALITIES) {
271  uint64 cigar_offset = 0;
272  for(uint64 i = 0; i < num_alns; i++) {
273  uint64 idx = shuffle_idx[i];
274  thrust::copy_n(cigars.begin() + crq_index[idx].cigar_start, crq_index[idx].cigar_len,
275  batch_out.cigars.begin() + cigar_offset);
276  batch_out.crq_index[i].cigar_start = cigar_offset;
277  batch_out.crq_index[i].cigar_len = crq_index[idx].cigar_len;
278  cigar_offset += crq_index[idx].cigar_len;
279  }
280  }
281  if(field_mask & BAM_READS) {
282  uint64 read_offset = 0;
283  for(uint64 i = 0; i < num_alns; i++) {
284  uint64 idx = shuffle_idx[i];
285  uint64* data_in = (uint64 *)reads.addrof(crq_index[idx].read_start);
286  uint64* data_out = (uint64 *)batch_out.reads.addrof(read_offset);
287  memcpy(data_out, data_in, crq_index[idx].read_len/2);
288  batch_out.crq_index[i].read_start = read_offset;
289  batch_out.crq_index[i].read_len = crq_index[idx].read_len;
290  const uint32 padded_read_len_bp = ((crq_index[idx].read_len + 7) / 8) * 8;
291  read_offset += padded_read_len_bp;
292  }
293  }
294  if(field_mask & BAM_QUALITIES) {
295  uint64 qual_offset = 0;
296  for(uint64 i = 0; i < num_alns; i++) {
297  uint64 idx = shuffle_idx[i];
298  thrust::copy_n(qualities.begin() + crq_index[idx].qual_start, crq_index[idx].qual_len,
299  batch_out.qualities.begin() + qual_offset);
300  batch_out.crq_index[i].qual_start = qual_offset;
301  batch_out.crq_index[i].qual_len = crq_index[idx].qual_len;
302  qual_offset += crq_index[idx].qual_len;
303  }
304  }
305  if(field_mask & BAM_AUX) {
306  uint64 aux_offset = 0;
307  for(uint64 i = 0; i < num_alns; i++) {
308  uint64 idx = shuffle_idx[i];
309  thrust::copy_n(aux_data.begin() + aln_index[idx].aux_data_start, aln_index[idx].aux_data_len, batch_out.aux_data.begin() + aux_offset);
310  batch_out.aln_index[i].aux_data_start = aux_offset;
311  batch_out.aln_index[i].aux_data_len = aln_index[idx].aux_data_len;
312  aux_offset += aln_index[idx].aux_data_len;
313  }
314  }
315  if(field_mask & BAM_NAMES){
316  uint64 names_offset = 0;
317  for(uint64 i = 0; i < num_alns; i++) {
318  uint64 idx = shuffle_idx[i];
319  uint32 len;
320  if(idx < num_alns-1) {
321  len = aln_index[idx+1].name_start-aln_index[idx].name_start;
322  } else {
323  len = names.size()-aln_index[idx].name_start;
324  }
325  thrust::copy_n(names.begin() + aln_index[idx].name_start, len, batch_out.names.begin() + names_offset);
326  batch_out.aln_index[i].name_start = names_offset;
327  names_offset += len;
328  }
329  }
330  if(field_mask == BAM_ALL) {
331  thrust::gather(shuffle_idx.begin(), shuffle_idx.end(), next_positions.begin(), batch_out.next_positions.begin());
332  thrust::gather(shuffle_idx.begin(), shuffle_idx.end(), next_refIDs.begin(), batch_out.next_refIDs.begin());
333  }
334  }
335 };
336 
337 
340 // htslib-based BAM file reader
342 {
343 public:
345 
346 private:
347  samFile* fp;
348  long fp_offset;
349 
350 public:
351  HTSBAMReader(const char *fname);
352  ~HTSBAMReader();
353 
354  bool read_aln_batch(std::vector<bam1_t*>& batch, const uint64 batch_size = 1000000);
355  bool read_aln_batch(BAM_alignment_batch_SoA& batch, const uint64 batch_size = 1000000);
356  bool read_aln_batch_intv(BAM_alignment_batch_SoA& batch, const uint32 contig = 1u, const uint64 start = 0u, const uint64 end = 1000000);
357 private:
358  bool read_hdr(void);
359  void parse_aln_rec(BAM_alignment_batch_SoA& batch, bam1_t* b, H_VectorU32& cigar_temp);
360 };
361 
364 // htslib-based BAM file writer
366 {
367 private:
368  samFile* fp;
369 
370 public:
371  HTSBAMWriter(const char *fname);
372  ~HTSBAMWriter();
373 
374  void write_hdr(bam_hdr_t* header);
375  void write_aln_batch(std::vector<bam1_t*>& batch, H_VectorU64 shuffle_idx, bam_hdr_t* header);
376  void write_aln_batch(BAM_alignment_batch_SoA& batch, H_VectorU64 shuffle_idx, bam_hdr_t* header);
377 };