NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
reads.cpp
Go to the documentation of this file.
1 /*
2  * nvbio
3  * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #include <nvbio/io/reads/reads.h>
32 #include <nvbio/io/reads/sam.h>
33 #include <nvbio/io/reads/bam.h>
34 #include <nvbio/basic/console.h>
36 #include <nvbio/basic/timer.h>
37 #include <cuda_runtime.h>
38 
39 #include <string.h>
40 
41 namespace nvbio {
42 namespace io {
43 
44 // factory method to open a read file, tries to detect file type based on file name
45 ReadDataStream *open_read_file(const char* read_file_name,
46  const QualityEncoding qualities,
47  const uint32 max_reads,
48  const uint32 truncate_read_len,
49  const ReadEncoding flags)
50 {
51  // parse out file extension; look for .fastq.gz, .fastq suffixes
52  uint32 len = uint32( strlen(read_file_name) );
53  bool is_gzipped = false;
54 
55  // do we have a .gz suffix?
56  if (len >= strlen(".gz"))
57  {
58  if (strcmp(&read_file_name[len - strlen(".gz")], ".gz") == 0)
59  {
60  is_gzipped = true;
61  len = uint32(len - strlen(".gz"));
62  }
63  }
64 
65  // check for fastq suffix
66  if (len >= strlen(".fastq"))
67  {
68  if (strncmp(&read_file_name[len - strlen(".fastq")], ".fastq", strlen(".fastq")) == 0)
69  {
70  return new ReadDataFile_FASTQ_gz(read_file_name,
71  qualities,
72  max_reads,
73  truncate_read_len,
74  flags);
75  }
76  }
77 
78  // check for fastq suffix
79  if (len >= strlen(".fq"))
80  {
81  if (strncmp(&read_file_name[len - strlen(".fq")], ".fq", strlen(".fq")) == 0)
82  {
83  return new ReadDataFile_FASTQ_gz(read_file_name,
84  qualities,
85  max_reads,
86  truncate_read_len,
87  flags);
88  }
89  }
90 
91  // check for txt suffix
92  if (len >= strlen(".txt"))
93  {
94  if (strncmp(&read_file_name[len - strlen(".txt")], ".txt", strlen(".txt")) == 0)
95  {
96  return new ReadDataFile_TXT_gz(read_file_name,
97  qualities,
98  max_reads,
99  truncate_read_len,
100  flags);
101  }
102  }
103 
104  // check for sam suffix
105  if (len >= strlen(".sam"))
106  {
107  if (strncmp(&read_file_name[len - strlen(".sam")], ".sam", strlen(".sam")) == 0)
108  {
109  ReadDataFile_SAM *ret;
110 
111  ret = new ReadDataFile_SAM(read_file_name,
112  max_reads,
113  truncate_read_len,
114  flags);
115 
116  if (ret->init() == false)
117  {
118  delete ret;
119  return NULL;
120  }
121 
122  return ret;
123  }
124  }
125 
126  // check for bam suffix
127  if (len >= strlen(".bam"))
128  {
129  if (strncmp(&read_file_name[len - strlen(".bam")], ".bam", strlen(".bam")) == 0)
130  {
131  ReadDataFile_BAM *ret;
132 
133  ret = new ReadDataFile_BAM(read_file_name,
134  max_reads,
135  truncate_read_len,
136  flags);
137 
138  if (ret->init() == false)
139  {
140  delete ret;
141  return NULL;
142  }
143 
144  return ret;
145  }
146  }
147 
148  // we don't actually know what this is; guess fastq
149  log_warning(stderr, "could not determine file type for %s; guessing %sfastq\n", read_file_name, is_gzipped ? "compressed " : "");
150  return new ReadDataFile_FASTQ_gz(read_file_name,
151  qualities,
152  max_reads,
153  truncate_read_len,
154  flags);
155 }
156 
157 namespace { // anonymous
158 
159 // converts ASCII characters for amino-acids into
160 // a 5 letter alphabet for { A, C, G, T, N }.
161 inline unsigned char nst_nt4_encode(unsigned char c)
162 {
163  static unsigned char nst_nt4_table[256] = {
164  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
165  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
166  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5 /*'-'*/, 4, 4,
167  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
168  4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
169  4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
170  4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
171  4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
172  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
173  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
174  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
175  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
176  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
177  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
178  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
179  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
180  };
181 
182  return nst_nt4_table[c];
183 }
184 
185 // convert a quality value in one of the supported encodings to Phred
186 template <QualityEncoding encoding>
187 inline unsigned char convert_to_phred_quality(const uint8 q)
188 {
189  // this table maps Solexa quality values to Phred scale
190  static unsigned char s_solexa_to_phred[] = {
191  0, 1, 1, 1, 1, 1, 1, 2, 2, 3,
192  3, 4, 4, 5, 5, 6, 7, 8, 9, 10,
193  10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
194  20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
195  30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
196  40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
197  50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
198  60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
199  70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
200  80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
201  90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
202  100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
203  110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
204  120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
205  130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
206  140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
207  150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
208  160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
209  170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
210  180, 181, 182, 183, 184, 185, 186, 187, 188, 189,
211  190, 191, 192, 193, 194, 195, 196, 197, 198, 199,
212  200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
213  210, 211, 212, 213, 214, 215, 216, 217, 218, 219,
214  220, 221, 222, 223, 224, 225, 226, 227, 228, 229,
215  230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
216  240, 241, 242, 243, 244, 245, 246, 247, 248, 249,
217  250, 251, 252, 253, 254, 255
218  };
219 
220  switch(encoding)
221  {
222  case Phred:
223  return q;
224 
225  case Phred33:
226  return q - 33;
227 
228  case Phred64:
229  return q - 64;
230 
231  case Solexa:
232  return s_solexa_to_phred[q];
233  }
234 
235  // gcc is dumb
236  return q;
237 }
238 
239 // complement a base pair
240 inline unsigned char complement_bp(unsigned char bp)
241 {
242  switch(bp)
243  {
244  case 'A':
245  return 'T';
246 
247  case 'a':
248  return 't';
249 
250  case 'T':
251  return 'A';
252 
253  case 't':
254  return 'a';
255 
256  case 'G':
257  return 'C';
258 
259  case 'g':
260  return 'c';
261 
262  case 'C':
263  return 'G';
264 
265  case 'c':
266  return 'g';
267 
268  default:
269  return bp;
270  }
271 }
272 
273 } // anonymous namespace
274 
276  : ReadData()
277 {
278  // old mechanism employed before introducing reserve()
279  //m_read_vec.reserve( 8*1024*1024 );
280  //m_qual_vec.reserve( 64*1024*1024 );
281 
282  m_read_index_vec.reserve( 16*1024 );
283  m_read_index_vec.resize( 1u );
284  m_read_index_vec[0] = 0u;
285 
286  m_name_index_vec.reserve( 16*1024 );
287  m_name_index_vec.resize( 1u );
288  m_name_index_vec[0] = 0u;
289 }
290 
291 #if defined(PRESIZED_VECTORS)
292  #define PRESIZE_VECTORS(vec, x) vec.resize(x)
293  #define ADJUST_VECTORS(vec, x) vec.resize(x)
294  #define RESIZE_VECTORS(vec, x) if (vec.size() < x) vec.resize(x)
295 #else
296  #define PRESIZE_VECTORS(vec, x)
297  #define ADJUST_VECTORS(vec, x)
298  #define RESIZE_VECTORS(vec, x) vec.resize(x)
299 #endif
300 
301 // reserve enough storage for a given number of reads and bps
302 //
303 void ReadDataRAM::reserve(const uint32 n_reads, const uint32 n_bps)
304 {
305  // a default read id length used to reserve enough space upfront and avoid frequent allocations
306  const uint32 AVG_NAME_LENGTH = 250;
307 
308  const uint32 bps_per_word = 32u / ReadData::READ_BITS;
309 
310  m_read_vec.reserve( n_bps / bps_per_word );
311  m_qual_vec.reserve( n_bps );
312  m_read_index_vec.reserve( n_reads+1 );
313  m_name_index_vec.reserve( AVG_NAME_LENGTH * n_reads );
314  m_name_index_vec.reserve( n_reads+1 );
315 
316  // pre-size the vectors
317  PRESIZE_VECTORS( m_read_vec, n_bps / bps_per_word );
318  PRESIZE_VECTORS( m_qual_vec, n_bps );
319 }
320 
321 // signals that the batch is complete
323 {
324  assert(m_read_stream_words == (m_read_stream_len + 7) / 8);
325 
326  m_avg_read_len = (uint32) ceilf(float(m_read_stream_len) / float(m_n_reads));
327 
328  // adjust the vector sizes
331 
332  // set the stream pointers
336 
339 }
340 
341 // a small read class supporting REVERSE | COMPLEMENT operations
342 //
343 template <ReadDataRAM::StrandOp FLAGS>
345 {
346  // constructor
347  read_string(const uint32 len, const uint8* read, const uint8* qual) : m_len(len), m_read(read), m_qual(qual) {}
348 
349  // string length
350  uint32 length() const { return m_len; }
351 
352  // indexing operator
353  uint8 operator[] (const uint32 i) const
354  {
355  const uint32 index = (FLAGS & ReadDataRAM::REVERSE_OP) ? m_len - i - 1u : i;
356 
357  // fetch the bp
358  const uint8 bp = nst_nt4_encode( m_read[index] );
359 
361  return bp < 4u ? 3u - bp : 4u;
362 
363  return bp;
364  }
365 
366  // quality operator
367  uint8 quality(const uint32 i) const
368  {
369  const uint32 index = (FLAGS & REVERSE) ? m_len - i - 1u : i;
370 
371  return m_qual[index];
372  }
373 
374  const uint32 m_len;
375  const uint8* m_read;
376  const uint8* m_qual;
377 };
378 
379 // encode a read according to a compile-time quality-encoding
380 //
381 template <QualityEncoding quality_encoding, typename read_type>
382 void encode(
383  const read_type read,
385  char* qual_stream)
386 {
387  #if 1
388 
389  // use the custom PackedStream assign() method
390  assign( read.length(), read, stream );
391 
392  // naive serial implementation
393  for (uint32 i = 0; i < read.length(); i++)
394  qual_stream[i] = convert_to_phred_quality<quality_encoding>(read.quality(i));
395 
396  #else
397  // naive serial implementation
398  for (uint32 i = 0; i < read.length(); i++)
399  {
400  stream[i] = read[i];
401  qual_stream[i] = convert_to_phred_quality<quality_encoding>(read.quality(i));
402  }
403  #endif
404 }
405 
406 // encode a read according to some compile-time flags and run-time quality-encoding
407 //
408 template <typename read_type>
409 void encode(
410  const QualityEncoding quality_encoding,
411  const read_type read,
413  char* qual_stream)
414 {
415  switch (quality_encoding)
416  {
417  case Phred:
418  encode<Phred>( read, stream, qual_stream );
419  break;
420  case Phred33:
421  encode<Phred33>( read, stream, qual_stream );
422  break;
423  case Phred64:
424  encode<Phred64>( read, stream, qual_stream );
425  break;
426  case Solexa:
427  encode<Solexa>( read, stream, qual_stream );
428  break;
429  }
430 }
431 
432 // encode a read according to some given run-time flags and quality-encoding
433 //
434 void encode(
435  const ReadDataRAM::StrandOp conversion_flags,
436  const QualityEncoding quality_encoding,
437  const uint32 read_len,
438  const uint8* read,
439  const uint8* quality,
441  char* qual_stream)
442 {
443 
444  const read_string<ReadDataRAM::REVERSE_OP> r_read( read_len, read, quality );
445  const read_string<ReadDataRAM::REVERSE_COMPLEMENT_OP> rc_read( read_len, read, quality );
446  const read_string<ReadDataRAM::COMPLEMENT_OP> fc_read( read_len, read, quality );
447  const read_string<ReadDataRAM::NO_OP> f_read( read_len, read, quality );
448 
449  if (conversion_flags & ReadDataRAM::REVERSE_OP)
450  {
451  if (conversion_flags & ReadDataRAM::COMPLEMENT_OP)
452  encode( quality_encoding, rc_read, stream, qual_stream );
453  else
454  encode( quality_encoding, r_read, stream, qual_stream );
455  }
456  else
457  {
458  if (conversion_flags & ReadDataRAM::COMPLEMENT_OP)
459  encode( quality_encoding, fc_read, stream, qual_stream );
460  else
461  encode( quality_encoding, f_read, stream, qual_stream );
462  }
463 }
464 
465 // add a read to this batch
467  const char *name,
468  const uint8* read,
469  const uint8* quality,
470  const QualityEncoding quality_encoding,
471  const uint32 truncate_read_len,
472  const StrandOp conversion_flags)
473 {
474  // truncate read
475  // xxx: should we do this silently?
476  read_len = nvbio::min(read_len, truncate_read_len);
477 
478  assert(read_len);
479 
480  // resize the reads & quality buffers
481  {
482  static const uint32 bps_per_word = 32u / ReadData::READ_BITS;
483  const uint32 stream_len = m_read_stream_len + read_len;
484  const uint32 words = (stream_len + bps_per_word - 1) / bps_per_word;
485 
486  RESIZE_VECTORS( m_read_vec, words );
487  RESIZE_VECTORS( m_qual_vec, stream_len );
488 
490  }
491 
492  // encode the read data
494  encode(
495  conversion_flags,
496  quality_encoding,
497  read_len,
498  read,
499  quality,
500  stream.begin() + m_read_stream_len,
502 
503  // update read and bp counts
504  m_n_reads++;
505  m_read_stream_len += read_len;
507 
510 
511  // store the read name
512  const uint32 name_len = uint32(strlen(name));
513  const uint32 name_offset = m_name_stream_len;
514 
515  m_name_vec.resize(name_offset + name_len + 1);
516  //strcpy(&m_name_vec[name_offset], name);
517  memcpy(&m_name_vec[name_offset],name,name_len + 1);
518 
519  m_name_stream_len += name_len + 1;
521 }
522 
523 // utility function to alloc and copy a vector in device memory
524 template <typename T>
525 static void cudaAllocAndCopyVector(T*& dst, const T* src, const uint32 words, uint64& allocated)
526 {
527  const uint32 words4 = 4u * ((words + 3u) / 4u);
528  if (src)
529  {
530  cudaMalloc( &dst, sizeof(T) * words4 );
531  if (dst == NULL)
532  throw std::bad_alloc(WINONLY("ReadDataDevice: not enough device memory"));
533 
534  cudaMemcpy( dst, src, sizeof(T) * words, cudaMemcpyHostToDevice );
535 
536  allocated += words4 * sizeof(T);
537  }
538  else
539  dst = NULL;
540 }
541 
542 ReadDataDevice::ReadDataDevice(const ReadData& host_data, const uint32 flags)
543  : ReadData(),
544  m_allocated( 0 )
545 {
546  m_name_stream_len = 0;
547  m_name_stream = NULL;
548  m_name_index = NULL;
549  m_qual_stream = NULL;
550  m_read_stream = NULL;
551  m_read_index = NULL;
552 
553  m_n_reads = host_data.m_n_reads;
554  m_read_stream_len = 0;
556 
557  m_min_read_len = host_data.m_min_read_len;
558  m_max_read_len = host_data.m_max_read_len;
559  m_avg_read_len = host_data.m_avg_read_len;
560 
561  if (flags & READS)
562  {
565 
566  cudaAllocAndCopyVector( m_read_stream, host_data.m_read_stream, m_read_stream_words, m_allocated );
567  cudaAllocAndCopyVector( m_read_index, host_data.m_read_index, m_n_reads+1, m_allocated );
568  }
569  if (flags & QUALS)
570  cudaAllocAndCopyVector( m_qual_stream, host_data.m_qual_stream, m_read_stream_len, m_allocated );
571 }
572 
574 {
575  if (m_read_stream)
576  cudaFree( m_read_stream );
577 
578  if (m_read_index)
579  cudaFree( m_read_index );
580 
581  if (m_qual_stream)
582  cudaFree( m_qual_stream );
583 }
584 
585 // grab the next batch of reads into a host memory buffer
586 ReadData *ReadDataFile::next(const uint32 batch_size, const uint32 batch_bps)
587 {
588  const uint32 reads_to_load = std::min(m_max_reads - m_loaded, batch_size);
589 
590  if (!is_ok() || reads_to_load == 0)
591  return NULL;
592 
593  // a default average read length used to reserve enough space
594  const uint32 AVG_READ_LENGTH = 100;
595 
596  ReadDataRAM *reads = new ReadDataRAM();
597  reads->reserve(
598  batch_size,
599  batch_bps == uint32(-1) ? batch_size * AVG_READ_LENGTH : batch_bps ); // try to use a default read length
600 
601  while (reads->size() < reads_to_load &&
602  reads->bps() < batch_bps)
603  {
604  // load 100 at a time if possible
605  const uint32 chunk_reads = nvbio::min(reads_to_load - reads->size(), uint32(100));
606  const uint32 chunk_bps = batch_bps - reads->bps();
607 
608  const int n = nextChunk(reads, chunk_reads, chunk_bps);
609  assert(n <= (int) chunk_reads);
610  if (n == 0)
611  break;
612 
613  assert(reads->size() <= reads_to_load);
614  }
615 
616  if (reads->size() == 0)
617  {
618  delete reads;
619  return NULL;
620  }
621 
622  m_loaded += reads->size();
623 
624  reads->end_batch();
625 
626  return reads;
627 }
628 
629 } // namespace io
630 } // namespace nvbio