NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
output_sam.cpp
Go to the documentation of this file.
1 /*
2  * nvbio
3  * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
29 #include <nvbio/basic/numbers.h>
30 
31 #include <stdio.h>
32 #include <stdarg.h>
33 
34 namespace nvbio {
35 namespace io {
36 
37 SamOutput::SamOutput(const char *file_name, AlignmentType alignment_type, BNT bnt)
38  : OutputFile(file_name, alignment_type, bnt)
39 {
40  fp = fopen(file_name, "wt");
41  if (fp == NULL)
42  {
43  log_error(stderr, "SamOutput: could not open %s for writing\n", file_name);
44  return;
45  }
46 
47  // set a 256kb output buffer on fp and make sure it's not line buffered
48  // this makes sure small fwrites do not land on disk straight away
49  // (256kb was chosen based on the default stripe size for Linux mdraid RAID-5 volumes)
50  setvbuf(fp, NULL, _IOFBF, 256 * 1024);
51 }
52 
54 {
55  if (fp)
56  {
57  fclose(fp);
58  fp = NULL;
59  }
60 }
61 
62 void SamOutput::write_formatted_string(const char *fmt, ...)
63 {
64  va_list args;
65 
66  fwrite("\t", 1, 1, fp);
67  va_start(args, fmt);
68  vfprintf(fp, fmt, args);
69 }
70 
71 void SamOutput::write_string(const char *str, bool tab)
72 {
73  if (tab)
74  fwrite("\t", 1, 1, fp);
75 
76  fwrite(str, 1, strlen(str), fp);
77 }
78 
79 namespace {
80 // utility function to convert an int to a base-10 string representation
81 template <typename T> int itoa(char *buf, T in)
82 {
83  int len = 0;
84  bool negative = false;
85 
86  // track the sign
87  if (in < 0)
88  {
89  negative = true;
90  in = -int(in);
91  }
92 
93  // convert to base10
94  do
95  {
96  buf[len] = "0123456789"[in % 10];
97  in /= 10;
98  len++;
99  } while(in);
100 
101  // add sign
102  if (negative)
103  {
104  buf[len] = '-';
105  len++;
106  }
107 
108  // reverse
109  for(int c = 0; c < len / 2; c++)
110  {
111  char tmp;
112  tmp = buf[c];
113  buf[c] = buf[len - c - 1];
114  buf[len - c - 1] = tmp;
115  }
116 
117  // terminate
118  buf[len] = 0;
119  return len;
120 }
121 }
122 
123 template<typename T>
124 void SamOutput::write_int(T i, bool tab)
125 {
126  char str[32];
127  int len;
128 
129  len = itoa(str, i);
130 
131  if (tab)
132  {
133  fwrite("\t", 1, 1, fp);
134  }
135 
136  fwrite(str, len, 1, fp);
137 }
138 
139 template<typename T>
140 void SamOutput::write_tag(const char *name, T value)
141 {
142  write_string(name);
143  write_string(":i:", false);
144  write_int(value, false);
145 }
146 
147 template<>
148 void SamOutput::write_tag(const char *name, const char *value)
149 {
150  write_string(name);
151  write_string(":Z:", false);
152  write_string(value, false);
153 }
154 
155 void SamOutput::linebreak(void)
156 {
157  fwrite("\n", 1, 1, fp);
158 }
159 
160 void SamOutput::output_header(void)
161 {
162  write_string("@HD", false);
163  write_string("VN:1.3");
164  linebreak();
165 
166  if (!rg_id.empty())
167  {
168  // write the RG:ID
169  write_string("@RG", false);
170  write_formatted_string("ID:%s",rg_id.c_str());
171 
172  // write the other RG tags
173  if (!rg_string.empty())
174  write_string( rg_string.c_str(), false );
175 
176  linebreak();
177  }
178 
179  write_string("@PG", false);
180  write_formatted_string("ID:%s",pg_id.c_str());
181  write_formatted_string("PN:%s",pg_name.c_str());
182  write_formatted_string("VN:%s",pg_version.c_str());
183  write_formatted_string("CL:\"%s\"",pg_args.c_str());
184  linebreak();
185 
186  // output the sequence info
187  for (uint32 i = 0; i < bnt.n_seqs; i++)
188  {
189  // sequence header
190  write_string("@SQ", false);
191  write_formatted_string("SN:%s", bnt.names + bnt.names_index[i]);
192  write_formatted_string("LN:%d", bnt.sequence_index[i+1] - bnt.sequence_index[i]);
193  linebreak();
194  }
195 }
196 
197 // generate the alignment's CIGAR string
198 // returns the computed length of the corresponding read based on the CIGAR operations
199 uint32 SamOutput::generate_cigar_string(struct SamAlignment& sam_align,
200  const AlignmentData& alignment)
201 {
202  char *output = sam_align.cigar;
203  uint32 read_len = 0;
204 
205  for(uint32 i = 0; i < alignment.cigar_len; i++)
206  {
207  const Cigar& cigar_entry = alignment.cigar[alignment.cigar_len - i - 1u];
208  const char cigar_op = "MIDS"[cigar_entry.m_type];
209  int len;
210 
211  // output count
212  len = itoa(output, cigar_entry.m_len);
213  output += len;
214  // output CIGAR op
215  *output = cigar_op;
216  output++;
217 
218  // check that we didn't overflow the CIGAR buffer
219  assert((unsigned long)(output - sam_align.cigar) < (unsigned long) (sizeof(sam_align.cigar) - 1));
220 
221  // keep track of number of BPs in the original read
222  if (cigar_op != 'D')
223  read_len += cigar_entry.m_len;
224  }
225 
226  // terminate the output string
227  *output = '\0';
228  return read_len;
229 }
230 
231 
232 // generate the MD string
233 uint32 SamOutput::generate_md_string(SamAlignment& sam_align, const AlignmentData& alignment)
234 {
235  if (alignment.mds_vec == NULL)
236  {
237  log_warning(stderr, " SAM: alignment %u from read %u has an empty MD string\n", alignment.aln_id, alignment.read_id);
238  sam_align.md_string[0] = '\0';
239  return 0;
240  }
241  const uint32 mds_len = uint32(alignment.mds_vec[0]) | (uint32(alignment.mds_vec[1]) << 8);
242 
243  char *buffer = sam_align.md_string;
244  uint32 buffer_len = 0;
245 
246  uint32 i;
247 
248  sam_align.mm = 0;
249  sam_align.gapo = 0;
250  sam_align.gape = 0;
251 
252  i = 2;
253  do
254  {
255  const uint8 op = alignment.mds_vec[i++];
256  switch (op)
257  {
258  case MDS_MATCH:
259  {
260  uint8 l = alignment.mds_vec[i++];
261 
262  // prolong the MDS match if it spans multiple tokens
263  while (i < mds_len && alignment.mds_vec[i] == MDS_MATCH)
264  l += alignment.mds_vec[i++];
265 
266  buffer_len += itoa(buffer + buffer_len, l);
267  }
268 
269  break;
270 
271  case MDS_MISMATCH:
272  {
273  const char c = dna_to_char(alignment.mds_vec[i++]);
274  buffer[buffer_len++] = c;
275 
276  sam_align.mm++;
277  }
278 
279  break;
280 
281  case MDS_INSERTION:
282  {
283  const uint8 l = alignment.mds_vec[i++];
284  i += l;
285 
286  sam_align.gapo++;
287  sam_align.gape += l - 1;
288  }
289 
290  break;
291 
292  case MDS_DELETION:
293  {
294  const uint8 l = alignment.mds_vec[i++];
295 
296  buffer[buffer_len++] = '^';
297  for(uint8 n = 0; n < l; n++)
298  {
299  buffer[buffer_len++] = dna_to_char(alignment.mds_vec[i++]);
300  }
301 
302  buffer[buffer_len++] = '0';
303 
304  sam_align.gapo++;
305  sam_align.gape += l - 1;
306  }
307 
308  break;
309  }
310  } while(i < mds_len);
311 
312  buffer[buffer_len] = '\0';
313  return buffer_len;
314 }
315 
316 // output a SAM alignment (but not tags)
317 void SamOutput::output_alignment(const struct SamAlignment& sam_align)
318 {
319  write_string(sam_align.qname, false);
320  write_int((uint32)sam_align.flags);
321 
322 // log_verbose(stderr, "%s: ed(NM)=%d score(AS)=%d second_score(XS)=%d mm(XM)=%d gapo(XO)=%d gape(XG)=%d\n",
323 // sam_align.qname,
324 // sam_align.ed, sam_align.score, sam_align.second_score, sam_align.mm, sam_align.gapo, sam_align.gape);
325  if (sam_align.flags & SAM_FLAGS_UNMAPPED)
326  {
327  // output * or 0 for every other required field
328  write_string("*\t0\t0\t*\t*\t0\t0");
329  write_string(sam_align.seq);
330  write_string(sam_align.qual);
331 
332  linebreak();
333 
334  return;
335  }
336 
337  write_string(sam_align.rname);
338  write_int(sam_align.pos);
339  write_int(sam_align.mapq);
340  write_string(sam_align.cigar);
341 
342  if (sam_align.rnext)
343  write_string(sam_align.rnext);
344  else
345  write_string("*");
346 
347  write_int(sam_align.pnext);
348  write_int(sam_align.tlen);
349 
350  write_string(sam_align.seq);
351  write_string(sam_align.qual);
352 
353  write_tag("NM", sam_align.ed);
354  write_tag("AS", sam_align.score);
355  if (sam_align.second_score_valid)
356  write_tag("XS", sam_align.second_score);
357 
358  write_tag("XM", sam_align.mm);
359  write_tag("XO", sam_align.gapo);
360  write_tag("XG", sam_align.gape);
361  if (sam_align.md_string[0])
362  write_tag("MD", sam_align.md_string);
363  else
364  write_tag("MD", "*");
365 
366  linebreak();
367 }
368 
369 uint32 SamOutput::process_one_alignment(const AlignmentData& alignment,
370  const AlignmentData& mate)
371 {
372  // output SamAlignment structure, to be filled out and sent to output_alignment
373  SamAlignment sam_align;
374 
375  const uint32 ref_cigar_len = reference_cigar_length(alignment.cigar, alignment.cigar_len);
376 
377  // setup alignment information
378  const uint32 seq_index = uint32(std::upper_bound(
381  alignment.cigar_pos ) - bnt.sequence_index) - 1u;
382 
383  // if we're doing paired-end alignment, the mate must be valid
384  NVBIO_CUDA_ASSERT(alignment_type == SINGLE_END || mate.valid == true);
385 
386  // fill out read name
387  sam_align.qname = alignment.read_name;
388 
389  // fill out sequence data
390  for(uint32 i = 0; i < alignment.read_len; i++)
391  {
392  uint8 s;
393 
394  if (alignment.aln->m_rc)
395  {
396  nvbio::complement_functor<4> complement;
397  s = complement(alignment.read_data[i]);
398  }
399  else
400  s = alignment.read_data[alignment.read_len - i - 1];
401 
402  sam_align.seq[i] = dna_to_char(s);
403  }
404  sam_align.seq[alignment.read_len] = '\0';
405 
406  // fill out quality data
407  for(uint32 i = 0; i < alignment.read_len; i++)
408  {
409  char q;
410 
411  if (alignment.aln[MATE_1].m_rc)
412  q = alignment.qual[i];
413  else
414  q = alignment.qual[alignment.read_len - i - 1];
415 
416  sam_align.qual[i] = q + 33;
417  }
418  sam_align.qual[alignment.read_len] = '\0';
419 
420  // compute mapping quality
421  sam_align.mapq = alignment.mapq;
422 
423  // if we didn't map, or mapped with low quality, output an unmapped alignment and return
424  if (!(alignment.aln->is_aligned() || sam_align.mapq < mapq_filter))
425  {
426  sam_align.flags = SAM_FLAGS_UNMAPPED;
427  // mark the md string as empty
428  sam_align.md_string[0] = '\0';
429 
430  // unaligned reads don't need anything else; output and return
431  output_alignment(sam_align);
432  return 0;
433  }
434 
435  // compute alignment flags
436  sam_align.flags = (alignment.aln->mate() ? SAM_FLAGS_READ_2 : SAM_FLAGS_READ_1);
437  if (alignment.aln->m_rc)
438  sam_align.flags |= SAM_FLAGS_REVERSE;
439 
440  if (alignment_type == PAIRED_END)
441  {
442  NVBIO_CUDA_ASSERT(mate.valid);
443 
444  sam_align.flags |= SAM_FLAGS_PAIRED;
445 
446  if (mate.aln->is_concordant())
447  sam_align.flags |= SAM_FLAGS_PROPER_PAIR;
448 
449  if (!mate.aln->is_aligned())
450  sam_align.flags |= SAM_FLAGS_MATE_UNMAPPED;
451 
452  if (mate.aln->is_rc())
453  sam_align.flags |= SAM_FLAGS_MATE_REVERSE;
454  }
455 
456  if (alignment.cigar_pos + ref_cigar_len > bnt.sequence_index[ seq_index+1 ])
457  {
458  // flag UNMAP as this alignment bridges two adjacent reference sequences
459  // xxxnsubtil: we still output the rest of the alignment data, does that make sense?
460  sam_align.flags |= SAM_FLAGS_UNMAPPED;
461  // unmapped segments get their mapq set to 0
462  sam_align.mapq = 0;
463  }
464 
465  sam_align.rname = bnt.names + bnt.names_index[ seq_index ];
466  sam_align.pos = uint32( alignment.cigar_pos - bnt.sequence_index[ seq_index ] + 1 );
467 
468  // fill out the cigar string...
469  uint32 computed_cigar_len = generate_cigar_string(sam_align, alignment);
470  // ... and make sure it makes (some) sense
471  if (computed_cigar_len != alignment.read_len)
472  {
473  log_error(stderr, "SAM output : cigar length doesn't match read %u (%u != %u)\n",
474  alignment.read_id /* xxxnsubtil: global_read_id */,
475  computed_cigar_len, alignment.read_len);
476  return sam_align.mapq;
477  }
478 
479  if (alignment_type == PAIRED_END)
480  {
481  if (mate.aln->is_aligned())
482  {
483  const uint32 o_ref_cigar_len = reference_cigar_length(mate.cigar, mate.cigar_len);
484 
485  // setup alignment information for the mate
486  const uint32 o_seq_index = uint32(std::upper_bound(
489  mate.cigar_pos ) - bnt.sequence_index) - 1u;
490 
491  if (o_seq_index == seq_index)
492  sam_align.rnext = "=";
493  else
494  sam_align.rnext = bnt.names + bnt.names_index[ o_seq_index ];
495 
496  sam_align.pnext = uint32( mate.cigar_pos - bnt.sequence_index[ o_seq_index ] + 1 );
497  if (o_seq_index != seq_index)
498  sam_align.tlen = 0;
499  else
500  {
501  sam_align.tlen = nvbio::max(mate.cigar_pos + o_ref_cigar_len,
502  alignment.cigar_pos + ref_cigar_len) -
503  nvbio::min(mate.cigar_pos, alignment.cigar_pos);
504 
505  if (mate.cigar_pos < alignment.cigar_pos)
506  sam_align.tlen = -sam_align.tlen;
507  }
508  } else {
509  // other mate is unmapped
510  sam_align.rnext = "=";
511  sam_align.pnext = (int)(alignment.cigar_pos - bnt.sequence_index[ seq_index ] + 1);
512  // xxx: check whether this is really correct...
513  sam_align.tlen = 0;
514  }
515  } else {
516  sam_align.rnext = NULL;
517  sam_align.pnext = 0;
518  sam_align.tlen = 0;
519  }
520 
521  // fill out tag data
522  sam_align.ed = alignment.aln->ed();
523  sam_align.score = alignment.aln->score();
524 
525  sam_align.second_score_valid = false; // TODO!
526 
527  generate_md_string(sam_align, alignment);
528 
529  // write out the alignment
530  output_alignment(sam_align);
531 
532  return sam_align.mapq;
533 }
534 
536 {
537  float time = 0.0f;
538  {
539  ScopedTimer<float> timer( &time );
540  ScopedLock lock( &mutex );
541 
542  for(uint32 c = 0; c < batch.count; c++)
543  {
544  AlignmentData alignment = get(batch, c);
546 
547  process_one_alignment(alignment, mate);
548  }
549  }
550  iostats.n_reads += batch.count;
551  iostats.output_process_timings.add( batch.count, time );
552 }
553 
555 {
556  float time = 0.0f;
557  {
558  ScopedTimer<float> timer( &time );
559  ScopedLock lock( &mutex );
560 
561  for(uint32 c = 0; c < batch.count; c++)
562  {
563  AlignmentData alignment = get_anchor_mate(batch,c);
564  AlignmentData mate = get_opposite_mate(batch,c);
565 
566  process_one_alignment(alignment, mate);
567  process_one_alignment(mate, alignment);
568  }
569  }
570  iostats.n_reads += batch.count;
571  iostats.output_process_timings.add( batch.count, time );
572 }
573 
575 {
576  fclose(fp);
577  fp = NULL;
578 }
579 
580 } // namespace io
581 } // namespace nvbio