NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
bam_format.h
Go to the documentation of this file.
1 /*
2  * nvbio
3  * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #pragma once
29 
30 #include <nvbio/basic/types.h>
31 
32 namespace nvbio {
33 namespace io {
34 
35 // BAM format description: http://samtools.sourceforge.net/SAM1.pdf
36 // note that we don't actually use BGZF here, just plain gzip since we're reading everything sequentially
37 // (in other words, we don't use BAM indices)
38 
39 // the BAM header section
40 struct BAM_header
41 {
 // Fixed-size fields of the BAM header and of the start of the reference
 // sequence section. In the file, the variable-length header text (l_text
 // bytes) sits between l_text and n_ref, so these members are not contiguous
 // in the stream.
 // NOTE(review): presumably the reader fills these fields one at a time rather
 // than reading the whole struct in a single call -- confirm against the reader.
42  // header section
43  uint8 magic[4]; // BAM magic string (4 bytes; "BAM\1" per the SAM/BAM specification)
44  int32 l_text; // length of the header text that follows, in bytes
45  // header text comes next; we ignore it
46 
47  // reference sequence section
48  int32 n_ref; // number of reference sequences described after this field
49 };
50 
51 // BAM reference sequence section
53 {
 // Fixed-size fields of one entry in the BAM reference sequence section.
 // In the file, the variable-length name (l_name bytes) sits between l_name
 // and l_ref, so the two members are not adjacent in the stream.
 // NOTE(review): the line declaring this struct's name is absent from this
 // listing (the embedded numbering skips from 51 to 53) -- restore it from the
 // original header.
54  int32 l_name; // length of the reference name + 1 (i.e. including the terminating NUL)
55  // reference sequence name goes here (NUL-terminated)
56  int32 l_ref; // length of the reference sequence
57 };
58 
59 // BAM alignment section
61 {
 // Fixed-size leading part of one BAM alignment record; the variable-length
 // data (see the note at the end of this struct) comes after it.
 // NOTE(review): the line declaring this struct's name is absent from this
 // listing (the embedded numbering skips from 59 to 61) -- restore it from the
 // original header.
62  int32 block_size; // length of the remainder of the alignment record (everything after this field)
63  int32 refID; // reference sequence ID, -1 <= refID < n_ref (-1 for a read without a mapping position)
64  int32 pos; // 0-based leftmost coordinate
65  uint32 bin_mq_nl; // bin << 16 | MAPQ << 8 | l_read_name
 // unpack: bin = bin_mq_nl >> 16, MAPQ = (bin_mq_nl >> 8) & 0xff, l_read_name = bin_mq_nl & 0xff
66  uint32 flag_nc; // FLAG << 16 | n_cigar_op
 // unpack: FLAG = flag_nc >> 16, n_cigar_op = flag_nc & 0xffff
67  int32 l_seq; // length of the sequence
68  int32 next_refID; // refID of the next segment (-1 <= next_refID < n_ref)
69  int32 next_pos; // 0-based leftmost pos of the next segment
70  int32 tlen; // template length
71 
72  // BAM_alignment_data_block follows
73 };
74 
76 {
 // Variable-length part of an alignment record, held in fixed-capacity buffers,
 // plus the values emitted as optional tags.
 // `name` is a pointer rather than inline storage, so this struct is an
 // in-memory representation and not the byte-for-byte file layout.
 // NOTE(review): the line declaring this struct's name is absent from this
 // listing (the embedded numbering skips from 74 to 76); it is presumably
 // BAM_alignment_data_block, the name the alignment record struct refers to --
 // restore it from the original header.
 // NOTE(review): the buffers are fixed at 1024 entries; presumably the writer
 // guarantees reads and CIGARs fit -- confirm where these arrays are filled.
77  const char *name; // read name, NUL-terminated
78  uint32 cigar[1024]; // CIGAR string, encoded as op_len << 4 | op ; 'MIDNSHP=X' -> 012345678
79  uint8 seq[1024]; // 4-bit encoded read: '=ACMGRSVTWYHKDBN' -> [0, 15]; other characters mapped to 'N'
80  // high nibble first: 1st base in the highest 4-bit of the 1st byte
81  uint8 qual[1024]; // Phred-scaled base qualities (a sequence of 0xFF if absent)
82 
83  // our own additional data, output as tags (only if read is mapped)
84  int32 ed; // NM:i
85  int32 score; // AS:i
86  int32 second_score; // XS:i (optional)
87  int32 mm; // XM:i
88  int32 gapo; // XO:i
89  int32 gape; // XG:i
90  char md_string[4096]; // MD:Z (mostly optional?)
91 
92  // extra data that's useful but not written out
93  bool second_score_valid; // do we have a second score? (second_score is meaningful only when true)
94 };
95 
96 } // namespace io
97 } // namespace nvbio