NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
sequence.h
Go to the documentation of this file.
1 /*
2  * nvbio
3  * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #pragma once
29 
30 #include <nvbio/strings/alphabet.h>
34 #include <nvbio/basic/vector.h>
35 #include <nvbio/basic/cuda/ldg.h>
37 
38 namespace nvbio {
39 namespace io {
40 
146 
149 
161 
162 // describes the quality encoding for a given read file
// NOTE(review): the `enum` header (listing line 163) is missing from this
// extract. The type is presumably QualityEncoding, because the loader
// declarations later in this file take `const QualityEncoding qualities = Phred33`.
164 {
165  Phred = 0, // presumably raw Phred scores without an ASCII offset -- TODO confirm
166  Phred33 = 1, // default of the loader declarations in this file; presumably Phred + ASCII offset 33 -- TODO confirm
167  Phred64 = 2, // presumably Phred + ASCII offset 64 -- TODO confirm
168  Solexa = 3, // presumably the Solexa quality scale -- TODO confirm
169 };
170 
171 // a set of flags describing the types of supported read strands
// NOTE(review): the `enum` header (listing line 172) is missing from this
// extract. The type is presumably SequenceEncoding, because a declaration
// later in this file takes `const SequenceEncoding flags = FORWARD`.
// Each enumerator occupies its own bit, so values can be OR-ed together.
173 {
174  FORWARD = 0x0001,
175  REVERSE = 0x0002,
176  FORWARD_COMPLEMENT = 0x0004,
177  REVERSE_COMPLEMENT = 0x0008,
178 };
179 
180 // a set of flags describing what to load
// NOTE(review): the `enum` header (listing line 181) is missing from this
// extract, so the type name is not visible here.
// Each enumerator occupies its own bit, so values can be OR-ed together.
182 {
183  SEQUENCE_DATA = 0x0001, // presumably the sequence symbols themselves -- TODO confirm
184  SEQUENCE_QUALS = 0x0002, // presumably the per-symbol quality scores -- TODO confirm
185  SEQUENCE_NAMES = 0x0004, // presumably the sequence names -- TODO confirm
186 };
187 
188 // how mates of a paired-end read are encoded
189 // F = forward, R = reverse
// NOTE(review): the `enum` header (listing line 190) is missing from this
// extract, so the type name is not visible here.
// The values are consecutive integers (0..3), not bit flags. The two-letter
// suffix presumably gives the orientation of the first and second mate, in
// that order -- TODO confirm.
191 {
192  PE_POLICY_FF = 0,
193  PE_POLICY_FR = 1,
194  PE_POLICY_RF = 2,
195  PE_POLICY_RR = 3,
196 };
197 
// NOTE(review): fragment. The struct header (presumably `struct SequenceDataInfo`,
// the type compared member-by-member further down), the constructor name and
// several member initializers are missing from this extract (see the gaps in
// the listing numbers).
// The visible part of the default constructor selects the PROTEIN alphabet,
// sets the sequence count to zero and clears the has-qualities flag.
202 {
207  : m_alphabet(PROTEIN),
208  m_n_seqs(0),
212  m_has_qualities(0),
216  {};
217 
224  NVBIO_HOST_DEVICE NVBIO_FORCEINLINE bool has_qualities() const { return m_has_qualities ? true : false; }
228 
235 
239 };
240 
// NOTE(review): fragment. The function header (listing lines 241-244) and
// several comparison terms are missing from this extract, which is why the
// visible expression ends on a dangling `&&`. This is presumably
// `operator==` for SequenceDataInfo, since the function that follows negates
// `op1 == op2` on the same operand types.
// The visible terms compare the alphabet, the number of sequences and the
// has-qualities flag of the two operands.
245  const SequenceDataInfo& op1,
246  const SequenceDataInfo& op2)
247 {
248  return
249  op1.m_alphabet == op2.m_alphabet &&
250  op1.m_n_seqs == op2.m_n_seqs &&
254  op1.m_has_qualities == op2.m_has_qualities &&
258 }
259 
// NOTE(review): fragment. The function header (listing lines 260-263) is
// missing from this extract; presumably `operator!=` for SequenceDataInfo.
// Returns the logical negation of the equality comparison of the two operands.
264  const SequenceDataInfo& op1,
265  const SequenceDataInfo& op2)
266 {
267  return !(op1 == op2);
268 }
269 
276 
// NOTE(review): fragment. The `struct`/`class` header line (listing line 301)
// is missing from this extract, so the type name is not visible here. The
// constructors below initialise a SequenceDataInfo base sub-object.
//
// Template parameters (all default to plain pointers):
//   IndexIterator            iterator used for both the sequence index and the name index
//   SequenceStorageIterator  iterator over the sequence storage (uint32 words by default)
//   QualStorageIterator      iterator over the quality storage (char by default)
//   NameStorageIterator      iterator over the name storage (char by default)
296 template <
297  typename IndexIterator = uint32*,
298  typename SequenceStorageIterator = uint32*,
299  typename QualStorageIterator = char*,
300  typename NameStorageIterator = char*>
302 {
// re-export the template arguments under lower-case member type names
303  typedef IndexIterator index_iterator;
304  typedef SequenceStorageIterator sequence_storage_iterator;
305  typedef QualStorageIterator qual_storage_iterator;
306  typedef NameStorageIterator name_storage_iterator;
307 
312 
317 
// NOTE(review): fragment. The constructor name line (listing lines 318-321) is
// missing from this extract.
// Constructor from a SequenceDataInfo and one iterator per stream.
//
// \param info             copied into the SequenceDataInfo base sub-object
// \param sequence_stream  stored in m_sequence_stream
// \param sequence_index   stored in m_sequence_index
// \param qual_stream      stored in m_qual_stream
// \param name_stream      stored in m_name_stream
// \param name_index       stored in m_name_index
//
// Only the iterators are copied; nothing they refer to is touched here.
322  const SequenceDataInfo& info,
323  const SequenceStorageIterator sequence_stream,
324  const IndexIterator sequence_index,
325  const QualStorageIterator qual_stream,
326  const NameStorageIterator name_stream,
327  const IndexIterator name_index)
328  : SequenceDataInfo ( info ),
329  m_name_stream (NameStorageIterator( name_stream )),
330  m_name_index (IndexIterator( name_index )),
331  m_sequence_stream (SequenceStorageIterator( sequence_stream )),
332  m_sequence_index (IndexIterator( sequence_index )),
333  m_qual_stream (QualStorageIterator( qual_stream ))
334  {}
335 
// NOTE(review): fragment. The constructor signature (listing lines 343-344) is
// missing from this extract; `in` is its only visible argument name.
// Converting constructor: builds this object from another instantiation whose
// iterator types may differ. The info part is copied from `in`, and each
// member iterator is explicitly converted from the matching member of `in`.
338  template <
339  typename InIndexIterator,
340  typename InSequenceIterator,
341  typename InQualIterator,
342  typename InNameIterator>
345  : SequenceDataInfo ( in ),
346  m_name_stream (NameStorageIterator( in.m_name_stream )),
347  m_name_index (IndexIterator( in.m_name_index )),
348  m_sequence_stream (SequenceStorageIterator( in.m_sequence_stream )),
349  m_sequence_index (IndexIterator( in.m_sequence_index )),
350  m_qual_stream (QualStorageIterator( in.m_qual_stream ))
351  {}
352 
// NOTE(review): fragment. The operator signature (listing lines 360-361) is
// missing from this extract; `in` is its only visible argument name.
// Converting assignment from another instantiation whose iterator types may
// differ. Returns *this.
355  template <
356  typename InIndexIterator,
357  typename InSequenceIterator,
358  typename InQualIterator,
359  typename InNameIterator>
362  {
363  // copy the info
364  this->SequenceDataInfo::operator=( in );
365 
366  // copy the iterators
// each one is explicitly converted to this instantiation's iterator type
367  m_name_stream = NameStorageIterator( in.m_name_stream );
368  m_name_index = IndexIterator( in.m_name_index );
369  m_sequence_stream = SequenceStorageIterator( in.m_sequence_stream );
370  m_sequence_index = IndexIterator( in.m_sequence_index );
371  m_qual_stream = QualStorageIterator( in.m_qual_stream );
372  return *this;
373  }
374 
380 
386 
389  NVBIO_HOST_DEVICE NVBIO_FORCEINLINE uint2 get_range(const uint32 i) const { return make_uint2( sequence_index()[i], sequence_index()[i+1] ); }
390 
394 
398 
399 };
400 
404 
406 
// NOTE(review): fragment. The struct header and at least one member line are
// missing from this extract. This is presumably `SequenceData`, judging by the
// destructor name.
// Polymorphic base: the destructor and both view conversions are virtual.
415 {
418 
421  virtual ~SequenceData() {}
422 
// conversion to a mutable plain view; this base version returns a
// default-constructed view
425  virtual operator plain_view_type() { return plain_view_type(); }
426 
// conversion to a const plain view; this base version returns a
// default-constructed view
429  virtual operator const_plain_view_type() const { return const_plain_view_type(); }
430 };
431 
435 template <typename system_tag>
437 {
439 
442 
447 
452 
456 
// NOTE(review): fragment. The constructor signature (listing line 460) is
// missing from this extract; `other` is its only visible argument name and,
// given the `other_tag` template parameter, is presumably an instantiation of
// this template for a different tag.
// Implemented by delegating to the assignment operator.
459  template <typename other_tag>
461  {
462  // copy
463  this->operator=( other );
464  }
465 
// NOTE(review): fragment. The constructor signature (listing line 473) is
// missing from this extract; `other` is its only visible argument name and,
// given the four iterator template parameters, is presumably a view
// parameterised on those iterator types.
// Implemented by delegating to the assignment operator.
468  template <
469  typename IndexIterator,
470  typename SequenceStorageIterator,
471  typename QualStorageIterator,
472  typename NameStorageIterator>
474  {
475  // copy
476  this->operator=( other );
477  }
478 
// Constructor from a generic SequenceData. Only declared here; a constructor
// template defined near the end of this file (it assigns from
// `plain_view( other )`) is presumably its out-of-line definition.
481  SequenceDataStorage(const SequenceData& other);
482 
// NOTE(review): fragment. The operator signature (listing line 486) and some
// of the vector assignments (listing lines 492-493 and 496) are missing from
// this extract.
// Assignment from an object whose storage vectors can be assigned to this
// one's member by member. Returns *this.
485  template <typename other_tag>
487  {
488  // copy the info
489  this->SequenceDataInfo::operator=( other );
490 
491  // copy the vectors
494  m_qual_vec = other.m_qual_vec;
495  m_name_vec = other.m_name_vec;
497  return *this;
498  }
499 
// NOTE(review): fragment. The operator signature (listing line 509) and
// several statements (listing lines 515, 520, 523 and 528) are missing from
// this extract. In particular the statements guarded by the two
// `if (m_has_qualities)` lines are absent, so do not read the statements
// that follow them here as their bodies.
// Assignment from an object exposing sequence_index(), name_stream() and
// name_index() accessors: copies the info, sizes the owned vectors
// accordingly, then copies the contents with thrust::copy. Returns *this.
504  template <
505  typename IndexIterator,
506  typename SequenceStorageIterator,
507  typename QualStorageIterator,
508  typename NameStorageIterator>
510  {
511  // copy the info
512  this->SequenceDataInfo::operator=( other );
513 
514  // resize the vectors
// both index vectors hold m_n_seqs + 1 entries
516  m_sequence_index_vec.resize( m_n_seqs + 1u );
517  m_name_vec.resize( m_name_stream_len );
518  m_name_index_vec.resize( m_n_seqs + 1u );
519  if (m_has_qualities)
521 
522  // and copy the contents
524  thrust::copy( other.sequence_index(), other.sequence_index() + m_n_seqs + 1u, m_sequence_index_vec.begin() );
525  thrust::copy( other.name_stream(), other.name_stream() + m_name_stream_len, m_name_vec.begin() );
526  thrust::copy( other.name_index(), other.name_index() + m_n_seqs + 1u, m_name_index_vec.begin() );
527  if (m_has_qualities)
529 
530  return *this;
531  }
// NOTE(review): fragment. The remaining constructor arguments (listing lines
// 538-542) are missing from this extract.
// Conversion to a mutable plain view. The first constructor argument is this
// object seen as its SequenceDataInfo base.
534  operator plain_view_type()
535  {
536  return plain_view_type(
537  static_cast<const SequenceDataInfo&>( *this ),
543  }
// NOTE(review): fragment. The remaining constructor arguments (listing lines
// 550-554) are missing from this extract.
// Conversion to a const plain view. The first constructor argument is this
// object seen as its SequenceDataInfo base.
546  operator const_plain_view_type() const
547  {
548  return const_plain_view_type(
549  static_cast<const SequenceDataInfo&>( *this ),
555  }
556 
557  // reserve enough storage for a given number of reads and bps
558  //
559  void reserve(const uint32 n_seqs, const uint32 n_bps)
560  {
561  // a default read id length used to reserve enough space upfront and avoid frequent allocations
562  const uint32 AVG_NAME_LENGTH = 250;
563 
564  const uint32 bps_per_word = 32u / bits_per_symbol( SequenceDataInfo::m_alphabet );
565 
566  m_sequence_index_vec.reserve( n_seqs+1 );
567  m_sequence_vec.reserve( n_bps / bps_per_word );
568  m_qual_vec.reserve( n_bps );
569  m_name_index_vec.reserve( AVG_NAME_LENGTH * n_seqs );
570  m_name_index_vec.reserve( n_seqs+1 );
571  }
572 
578 
584 
590 };
591 
594 
// NOTE(review): fragment. The struct header and at least one member line are
// missing from this extract. This is presumably `SequenceDataInputStream`,
// the stream type taken by the free functions declared after it.
// Abstract interface: every visible member is pure virtual.
599 {
603 
// Read the next batch into `encoder`. `batch_bps` defaults to uint32(-1), the
// largest uint32 value, presumably meaning "no limit on symbols" -- TODO
// confirm. The meaning of the int result is not visible here.
606  virtual int next(struct SequenceDataEncoder* encoder, const uint32 batch_size, const uint32 batch_bps = uint32(-1)) = 0;
607 
// status query; presumably true while the stream is usable -- TODO confirm
610  virtual bool is_ok() = 0;
611 
// presumably repositions the stream at its beginning and reports success -- TODO confirm
614  virtual bool rewind() = 0;
615 };
616 
620 
// Declaration only; the definition is not in this file. Takes an alphabet, a
// host-side destination, an input stream and a batch size. `batch_bps`
// defaults to uint32(-1), the largest uint32 value, presumably meaning "no
// limit on symbols" -- TODO confirm. The meaning of the int result is not
// visible here.
624 int next(const Alphabet alphabet, SequenceDataHost* data, SequenceDataInputStream* stream, const uint32 batch_size, const uint32 batch_bps = uint32(-1));
625 
// Declaration only; the definition is not in this file. Same parameter list
// as next(); judging by the name it presumably adds the batch to `data`
// instead of replacing its contents -- TODO confirm. The meaning of the int
// result is not visible here.
629 int append(const Alphabet alphabet, SequenceDataHost* data, SequenceDataInputStream* stream, const uint32 batch_size, const uint32 batch_bps = uint32(-1));
630 
// Declaration only; the definition is not in this file. Judging by the name it
// presumably advances `stream` past `batch_size` sequences without storing
// them -- TODO confirm. The meaning of the int result is not visible here.
634 int skip(SequenceDataInputStream* stream, const uint32 batch_size);
635 
// NOTE(review): fragment. The return type and function name (listing lines up
// to 649) are missing from this extract; only the parameter list is visible.
//
// \param sequence_file_name  path of the file to open
// \param qualities           quality encoding, Phred33 by default
// \param max_seqs            defaults to uint32(-1), presumably "no limit" -- TODO confirm
// \param max_sequence_len    defaults to uint32(-1), presumably "no limit" -- TODO confirm
// \param flags               strand flags, FORWARD by default
// \param trim3               defaults to 0; presumably symbols trimmed from the 3' end -- TODO confirm
// \param trim5               defaults to 0; presumably symbols trimmed from the 5' end -- TODO confirm
650  const char* sequence_file_name,
651  const QualityEncoding qualities = Phred33,
652  const uint32 max_seqs = uint32(-1),
653  const uint32 max_sequence_len = uint32(-1),
654  const SequenceEncoding flags = FORWARD,
655  const uint32 trim3 = 0,
656  const uint32 trim5 = 0);
657 
// NOTE(review): one parameter line (listing line 671) is missing from this
// extract.
// Declaration only; the definition is not in this file.
//
// \param alphabet            alphabet of the sequences
// \param sequence_data       host-side destination object
// \param sequence_file_name  path of the file to load
// \param qualities           quality encoding, Phred33 by default
// \return                    bool, presumably true on success -- TODO confirm
667 bool load_sequence_file(
668  const Alphabet alphabet,
669  SequenceDataHost* sequence_data,
670  const char* sequence_file_name,
672  const QualityEncoding qualities = Phred33);
673 
// NOTE(review): fragment. The return type and function name (listing lines up
// to 682) and one parameter line (listing line 685) are missing from this
// extract. Unlike the overload above there is no destination argument, so the
// loaded data is presumably returned -- TODO confirm.
683  const Alphabet alphabet,
684  const char* sequence_file_name,
686  const QualityEncoding qualities = Phred33);
687 
// NOTE(review): fragment. The struct header and at least one member line are
// missing from this extract. This is presumably `SequenceDataOutputStream`,
// the type returned by open_output_sequence_file() below.
// Abstract interface: every visible member is pure virtual.
692 {
696 
// presumably writes the given batch of sequences to the stream -- TODO confirm
699  virtual void next(const SequenceDataHost& sequence_data) = 0;
700 
// status query; presumably true while the stream is usable -- TODO confirm
703  virtual bool is_ok() = 0;
704 };
705 
// Declaration only; the definition is not in this file.
//
// \param sequence_file_name  path of the file to write
// \param compression         compression selector; accepted values are not visible here
// \return                    a raw pointer to an output stream
// NOTE(review): ownership of the returned pointer and the value returned on
// failure are not visible here -- confirm against the definition.
712 SequenceDataOutputStream* open_output_sequence_file(
713  const char* sequence_file_name,
714  const char* compression);
715 
718 
719 } // namespace io
720 
721 // return a plain view of a SequenceData object
722 //
// NOTE(review): the signature line (listing line 724) is missing from this
// extract; `sequence_data` is its only visible argument name.
// The view is obtained through the object's conversion operator, which is
// declared virtual in io::SequenceData.
723 inline
725 {
726  return io::SequenceData::plain_view_type( sequence_data );
727 }
728 
729 // return a plain view of a const SequenceData object
730 //
// NOTE(review): the signature line (listing line 732) is missing from this
// extract; `sequence_data` is its only visible argument name.
// The view is obtained through the object's const conversion operator, which
// is declared virtual in io::SequenceData.
731 inline
733 {
734  return io::SequenceData::const_plain_view_type( sequence_data );
735 }
736 
// NOTE(review): the signature line (listing line 741) is missing from this
// extract; `sequence_data` is its only visible argument name and is
// presumably an io::SequenceDataStorage<system_tag>.
// Returns that storage's mutable plain view, built by converting the argument.
739 template <typename system_tag>
740 typename io::SequenceDataStorage<system_tag>::plain_view_type
742 {
743  return typename io::SequenceDataStorage<system_tag>::plain_view_type( sequence_data );
744 }
745 
// NOTE(review): the signature line (listing line 750) is missing from this
// extract; `sequence_data` is its only visible argument name and is
// presumably a const io::SequenceDataStorage<system_tag>.
// Returns that storage's const plain view, built by converting the argument.
748 template <typename system_tag>
749 typename io::SequenceDataStorage<system_tag>::const_plain_view_type
751 {
752  return typename io::SequenceDataStorage<system_tag>::const_plain_view_type( sequence_data );
753 }
754 
755 namespace io {
756 
757 // copy constructor
758 //
// NOTE(review): the signature line (listing line 760) is missing from this
// extract. This is presumably the out-of-line definition of
// `SequenceDataStorage(const SequenceData& other)` declared inside the class.
// It takes a plain view of `other` and hands it to the assignment operator.
759 template <typename system_tag>
761 {
762  // copy
763  this->operator=( plain_view( other ) );
764 }
765 
766 } // namespace io
767 
768 } // namespace nvbio
769