NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
output_sam.h
Go to the documentation of this file.
1 /*
2  * nvbio
3  * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #pragma once
29 
36 #include <nvbio/basic/threads.h>
37 
38 #include <stdio.h>
39 
40 namespace nvbio {
41 namespace io {
42 
// SamOutput: an OutputFile subclass declaring the interface used to emit
// alignments as SAM text. Apart from the inline header(), only declarations
// are visible here; the definitions live in the implementation file.
43 struct SamOutput : public OutputFile
44 {
45 private:
46  // SAM alignment flags
47  // these are meant to be bitwise OR'ed together
  // (each enumerator is a distinct power of two, 1 through 1024, so they can
  // be combined into the uint32 SamAlignment::flags field)
48  typedef enum {
49  SAM_FLAGS_PAIRED = 1,
50  SAM_FLAGS_PROPER_PAIR = 2,
51  SAM_FLAGS_UNMAPPED = 4,
52  SAM_FLAGS_MATE_UNMAPPED = 8,
53  SAM_FLAGS_REVERSE = 16,
54  SAM_FLAGS_MATE_REVERSE = 32,
55  SAM_FLAGS_READ_1 = 64,
56  SAM_FLAGS_READ_2 = 128,
57  SAM_FLAGS_SECONDARY = 256,
58  SAM_FLAGS_QC_FAILED = 512,
59  SAM_FLAGS_DUPLICATE = 1024
60  } SamAlignmentFlags;
61 
62  // struct to hold a SAM alignment
63  // field names come from http://samtools.sourceforge.net/SAMv1.pdf (page 4)
  //
  // cigar, seq, qual and md_string are fixed 4096-byte arrays.
  // NOTE(review): whether the code that fills them bounds-checks against this
  // size is not visible in this header -- confirm in the implementation.
  // NOTE(review): qname, rname and rnext are plain const char pointers; who
  // owns the pointed-to strings and how long they live is not visible here.
64  struct SamAlignment
65  {
66  // required fields
67  const char * qname; // query template name
68  uint32 flags; // bitwise alignment flags from SamAlignmentFlags
69  const char * rname; // reference sequence name
70  uint32 pos; // 1-based leftmost mapping position
71  uint8 mapq; // mapping quality
72  char cigar[4096]; // CIGAR string
73  const char * rnext; // reference name of the mate/next read
74  uint32 pnext; // position of the mate/next read
75  int32 tlen; // observed template length
76  char seq[4096]; // segment sequence (xxxnsubtil: size this according to max read len)
77  char qual[4096]; // ASCII of phred-scaled base quality+33 (xxxnsubtil: same as above)
78 
79  // our own additional data, output as tags (only if read is mapped)
80  int32 ed; // NM:i
81  int32 score; // AS:i
82  int32 second_score; // XS:i (optional)
83  int32 mm; // XM:i
84  int32 gapo; // XO:i
85  int32 gape; // XG:i
86  char md_string[4096]; // MD:Z (mostly optional?)
87 
88  // extra data that's useful but not written out
89  bool second_score_valid; // do we have a second score?
90  };
91 
92 public:
  // destructor (defined out of line; its behaviour is not visible here)
94  ~SamOutput();
95 
  // emit the SAM header; simply forwards to the private output_header()
96  void header() { output_header(); }
97 
  // process a batch of type HostOutputBatchSE
  // (presumably single-end results -- confirm against the batch definition)
102  void process(struct HostOutputBatchSE& batch);
103 
  // process a batch of type HostOutputBatchPE
  // (presumably paired-end results -- confirm against the batch definition)
108  void process(struct HostOutputBatchPE& batch);
109 
  // close the output (defined out of line; exact effect not visible here)
110  void close(void);
111 
112 private:
113  // write a printf-style formatted string to the file (preceded by a \t)
114  void write_formatted_string(const char *fmt, ...);
115  // write a plain string
116  // tab controls whether to output a \t before the string
117  void write_string(const char *str, bool tab = true);
118  // write an integer
  // tab defaults to true, mirroring write_string
119  template <typename T> void write_int(T i, bool tab = true);
120  // add a line break
121  void linebreak();
122 
123  // write a SAM tag
  // (templated on the value type; the output format is chosen in the
  // definitions, which are not visible here)
124  template <typename T>
125  void write_tag(const char *name, T value);
126 
127  // output the SAM file header
128  void output_header(void);
129  // output an alignment
130  void output_alignment(const struct SamAlignment& aln);
131 
132  // process a single alignment from the stream and output it
  // NOTE(review): the meaning of the uint32 return value is not visible here
133  uint32 process_one_alignment(const AlignmentData& alignment,
134  const AlignmentData& mate);
135 
136  // generate a CIGAR string from the alignment data
  // sam_align is taken by non-const reference (presumably its cigar buffer
  // is filled in); the meaning of the uint32 return value is not visible here
137  uint32 generate_cigar_string(SamAlignment& sam_align,
138  const AlignmentData& alignment);
139  // generate the MD string from the internal representation
  // sam_align is taken by non-const reference (presumably its md_string
  // buffer is filled in); return value meaning not visible here
140  uint32 generate_md_string(SamAlignment& sam_align, const AlignmentData& alignment);
141 
142  // our file pointer
143  FILE *fp;
144 
  // NOTE(review): presumably serialises access to fp when several threads
  // write output -- confirm in the implementation
145  Mutex mutex;
146 };
147 
148 } // namespace io
149 } // namespace nvbio