NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
reads.h
Go to the documentation of this file.
1 /*
2  * nvbio
3  * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #pragma once
29 
34 #include <stdio.h>
35 #include <stdlib.h>
36 #include <vector>
37 
38 namespace nvbio {
39 namespace io {
40 
58 
61 
73 
74 // describes the quality encoding for a given read file
// NOTE(review): the line that names this enumeration is elided from this listing;
// presumably it is the QualityEncoding type taken by push_back() and open_read_file() - confirm.
76 {
77  // phred quality
78  Phred = 0,
79  // phred quality + 33
80  Phred33 = 1,
81  // phred quality + 64
82  Phred64 = 2,
    // NOTE(review): no description is given for this value in the listing;
    // presumably the Solexa quality scale - confirm the exact conversion against the implementation.
83  Solexa = 3,
84 };
85 
86 // a set of flags describing the types of supported read strands
// The visible values are distinct single bits (0x0001, 0x0002).
// NOTE(review): the enumeration's name line and two enumerators (listing lines 91-92) are
// elided here; presumably this is the ReadEncoding type used by open_read_file() - confirm.
88 {
89  FORWARD = 0x0001,
90  REVERSE = 0x0002,
93 };
94 
95 // how mates of a paired-end read are encoded
96 // F = forward, R = reverse
98 {
103 };
104 
116 template <
117  typename IndexIterator,
118  typename ReadStorageIterator,
119  typename QualStorageIterator,
120  typename NameStorageIterator>
122 {
123  typedef IndexIterator index_iterator;
125 
126  typedef ReadStorageIterator read_storage_iterator;
128 
129  typedef QualStorageIterator qual_storage_iterator;
131 
132  typedef NameStorageIterator name_storage_iterator;
134 
135  // symbol size for reads (in bits, going by the constant's name)
136  static const uint32 READ_BITS = 4;
137  // big endian? (marked deprecated; holds the same value as READ_BIG_ENDIAN below)
138  static const bool HI_BITS = false; // deprecated
139  // big endian?
140  static const bool READ_BIG_ENDIAN = false;
141  // symbols per word
     // NOTE(review): the numerator is 4*sizeof(uint32), i.e. four times the byte size of a word,
     // not its bit count (8*sizeof(uint32)). With a 4-byte uint32 and READ_BITS = 4 this evaluates
     // to 4, whereas 32 bits / 4 bits per symbol would be 8 - confirm which value is intended.
142  static const uint32 READ_SYMBOLS_PER_WORD = (4*sizeof(uint32))/READ_BITS;
143 
144  typedef PackedStream<
146  typedef PackedStream<
148 
151 
154 
155  typedef ConcatenatedStringSet<
158 
159  typedef ConcatenatedStringSet<
162 
163  typedef ConcatenatedStringSet<
166 
167  typedef ConcatenatedStringSet<
170 
171  typedef ConcatenatedStringSet<
174 
175  typedef ConcatenatedStringSet<
178 
183  : m_n_reads(0),
187  m_min_read_len(uint32(-1)),
188  m_max_read_len(0),
189  m_avg_read_len(0)
190  {};
191 
194  template <
195  typename InIndexIterator,
196  typename InReadIterator,
197  typename InQualIterator,
198  typename InNameIterator>
201  : m_n_reads (in.m_n_reads),
202  m_name_stream (NameStorageIterator(in.m_name_stream)),
204  m_name_index (IndexIterator(in.m_name_index)),
205  m_read_stream (ReadStorageIterator(in.m_read_stream)),
208  m_read_index (IndexIterator(in.m_read_index)),
209  m_qual_stream (QualStorageIterator(in.m_qual_stream)),
213  {}
214 
221 
228 
235 
// Returns the pair (m_read_index[i], m_read_index[i+1]) as a uint2: .x is the index entry
// for read i and .y is the entry that follows it, so callers use y - x as the read length.
// Reads element i+1, so m_read_index must hold at least i+2 entries; no bounds check is done here.
243  NVBIO_HOST_DEVICE NVBIO_FORCEINLINE uint2 get_range(const uint32 i) const { return make_uint2(m_read_index[i],m_read_index[i+1]); }
244 
248  {
249  return read_string_set_type(
250  size(),
251  read_stream().begin(),
252  read_index() );
253  }
254 
258  {
260  size(),
261  read_stream().begin(),
262  read_index() );
263  }
264 
268  {
270  size(),
271  read_stream().begin(),
272  read_index() );
273  }
274 
278  {
279  const uint2 read_range = get_range( i );
280  return read_string( read_range.y - read_range.x, read_stream().begin() + read_range.x );
281  }
282 
286  {
287  const uint2 read_range = get_range( i );
288  return const_read_string( read_range.y - read_range.x, read_stream().begin() + read_range.x );
289  }
290 
294  {
295  return qual_string_set_type(
296  size(),
297  qual_stream(),
298  read_index() );
299  }
300 
304  {
306  size(),
307  qual_stream(),
308  read_index() );
309  }
310 
314  {
316  size(),
317  qual_stream(),
318  read_index() );
319  }
320 
324  {
325  return name_string_set_type(
326  size(),
327  name_stream(),
328  name_index() );
329  }
330 
334  {
336  size(),
337  name_stream(),
338  name_index() );
339  }
340 
344  {
346  size(),
347  name_stream(),
348  name_index() );
349  }
350 
351 public:
352  // number of reads in this struct
354 
355  // a pointer to a buffer containing the names of all the reads in this batch
357  // the length (in bytes) of the name_stream buffer
359  // an array of uint32 with the byte indices of the starting locations of each name in name_stream
361 
362  // a pointer to a buffer containing the read data
363  // note that this could point at either host or device memory
365  // the length of read_stream in base pairs
367  // the number of words in read_stream
369  // an array of uint32 with the indices of the starting locations of each read in read_stream (in base pairs)
371 
372  // a pointer to a buffer containing quality data
373  // (the indices in m_read_index are also valid for this buffer)
375 
376  // statistics on the reads: minimum size, maximum size, average size
380 };
381 
386 
// Polymorphic read-batch type derived from ReadDataCore; it is the base of ReadDataRAM and
// ReadDataDevice and the type handed out by ReadDataStream::next().
391 struct ReadData : public ReadDataCore
392 {
395 
     // NOTE(review): the signature for the brace block below (listing lines 396-398) is elided;
     // presumably it is the default constructor - confirm.
     // The block sets all five stream/index members to NULL.
399  {
400  m_name_stream = NULL;
401  m_name_index = NULL;
402  m_read_stream = NULL;
403  m_read_index = NULL;
404  m_qual_stream = NULL;
405  }
406 
     // Virtual, empty destructor: derived objects can be destroyed through a ReadData pointer.
409  virtual ~ReadData() {}
410 };
411 
// ReadData subtype that carries std::vector members for read, quality and name data.
// Only declarations are visible here; the member function definitions are not part of this listing.
415 struct ReadDataRAM : public ReadData
416 {
     // Bit-valued options accepted by push_back() as conversion_flags.
     // NOTE(review): listing line 424 is elided; presumably a combined
     // REVERSE_OP | COMPLEMENT_OP enumerator lives there - confirm against the header.
419  enum StrandOp
420  {
421  NO_OP = 0x0000,
422  REVERSE_OP = 0x0001,
423  COMPLEMENT_OP = 0x0002,
425  };
426 
427  ReadDataRAM();
428 
     // NOTE(review): going by the name and parameters, this pre-sizes storage for
     // n_reads reads and n_bps base pairs - confirm against the definition.
431  void reserve(const uint32 n_reads, const uint32 n_bps);
432 
     // Adds one read to the batch (definition not visible here). Parameters as declared:
     //   read_len          - length of the read
     //   name              - C string with the read name
     //   base_pairs        - buffer of read symbols (NOTE(review): encoding not visible here - confirm)
     //   quality           - buffer of quality values, interpreted according to quality_encoding
     //   quality_encoding  - encoding of the quality buffer
     //   truncate_read_len - NOTE(review): presumably the maximum stored length, longer reads
     //                       being truncated - confirm
     //   conversion_flags  - StrandOp value selecting the strand transformation to apply
443  void push_back(uint32 read_len,
444  const char* name,
445  const uint8* base_pairs,
446  const uint8* quality,
447  const QualityEncoding quality_encoding,
448  const uint32 truncate_read_len,
449  const StrandOp conversion_flags);
450 
     // NOTE(review): presumably finalizes the batch once all reads have been pushed
     // (e.g. publishing the vectors through the base-class pointers) - confirm.
453  void end_batch(void);
454 
     // Backing storage. NOTE(review): the mapping to the base-class stream/index members is
     // inferred from the names only - confirm.
455  std::vector<uint32> m_read_vec;
456  std::vector<uint32> m_read_index_vec;
457  std::vector<char> m_qual_vec;
458  std::vector<char> m_name_vec;
459  std::vector<uint32> m_name_index_vec;
460 };
461 
// ReadData subtype constructed from another ReadData object ("host_data").
// NOTE(review): the name suggests the data is copied into device (GPU) memory; the
// definitions are not visible in this listing - confirm.
465 struct ReadDataDevice : public ReadData
466 {
     // Bit flags for the constructor's `flags` argument.
467  enum {
468  READS = 0x01,
469  QUALS = 0x02,
470  };
471 
     // Builds the object from host_data. `flags` is a mask of the values above and defaults to
     // READS only. NOTE(review): presumably the flags select which buffers get copied - confirm.
474  ReadDataDevice(const ReadData& host_data, const uint32 flags = READS);
475 
     // Overrides the virtual destructor declared in ReadData.
     // NOTE(review): no copy constructor or assignment operator is declared here; if the
     // destructor releases memory owned by this object, copying instances would need care - confirm.
478  ~ReadDataDevice();
479 
     // Returns the stored m_allocated value.
     // NOTE(review): presumably the number of bytes allocated by the constructor - confirm.
480  uint64 allocated() const { return m_allocated; }
481 
482 private:
483  uint64 m_allocated;
484 };
485 
488 
494 {
     // Abstract interface: next() and is_ok() are pure virtual, so only derived types can be
     // instantiated. (The line declaring this type's name is elided from the listing.)
     //
     // Stores truncate_read_len in m_truncate_read_len; the default is uint32(-1), the largest
     // uint32 value. The ';' after the empty body is a redundant empty declaration.
495  ReadDataStream(uint32 truncate_read_len = uint32(-1))
496  : m_truncate_read_len(truncate_read_len)
497  {
498  };
499 
     // Virtual, empty destructor so implementations can be destroyed through this interface.
502  virtual ~ReadDataStream() {}
503 
     // Returns a pointer to a ReadData batch. batch_size and batch_bps (default uint32(-1)) are
     // the caller's limits. NOTE(review): presumably maximum reads and maximum base pairs per
     // batch; ownership of the returned pointer and the end-of-stream return value are not
     // visible here - confirm.
506  virtual ReadData* next(const uint32 batch_size, const uint32 batch_bps = uint32(-1)) = 0;
507 
     // Status query. NOTE(review): presumably reports whether the stream opened/read correctly - confirm.
510  virtual bool is_ok() = 0;
511 
512  // maximum length of a read; longer reads are truncated to this size
     // NOTE(review): the member declaration (listing line 513) is elided; presumably
     // m_truncate_read_len, which the constructor initializes - confirm.
514 };
515 
516 
// Factory declaration: returns a ReadDataStream pointer for the given read file.
//   read_file_name - path of the read file
//   qualities      - quality encoding used by the file
//   max_reads      - defaults to uint32(-1) (NOTE(review): presumably "no limit" - confirm)
//   max_read_len   - defaults to uint32(-1) (NOTE(review): presumably forwarded as the
//                    stream's truncation length - confirm)
//   flags          - strand selection, defaults to REVERSE
// NOTE(review): the failure behavior (e.g. a NULL return) and ownership of the returned
// pointer are not visible in this listing - confirm before relying on either.
529 ReadDataStream *open_read_file(const char * read_file_name,
530  const QualityEncoding qualities,
531  const uint32 max_reads = uint32(-1),
532  const uint32 max_read_len = uint32(-1),
533  const ReadEncoding flags = REVERSE);
534 
537 
538 } // namespace io
539 
// Inline helper that constructs and returns an io::ReadData::plain_view_type from read_data.
// NOTE(review): the signature line (listing line 543) is elided; presumably this is the
// plain_view() overload taking a non-const io::ReadData - confirm.
542 inline
544 {
545  return io::ReadData::plain_view_type( read_data );
546 }
547 
// Inline helper that constructs and returns an io::ReadData::const_plain_view_type from read_data.
// NOTE(review): the signature line (listing line 551) is elided; presumably this is the
// plain_view() overload taking a const io::ReadData - confirm.
550 inline
552 {
553  return io::ReadData::const_plain_view_type( read_data );
554 }
555 
556 } // namespace nvbio