NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
qgram.h
Go to the documentation of this file.
1 /*
2  * nvbio
3  * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #pragma once
29 
30 #include <nvbio/basic/types.h>
31 #include <nvbio/basic/numbers.h>
32 #include <nvbio/basic/algorithms.h>
35 #include <nvbio/basic/exceptions.h>
36 #include <nvbio/basic/iterator.h>
37 #include <nvbio/basic/vector.h>
38 #include <nvbio/strings/seeds.h>
39 #include <thrust/host_vector.h>
40 #include <thrust/device_vector.h>
41 #include <thrust/sort.h>
42 #include <thrust/for_each.h>
43 #include <thrust/binary_search.h>
44 #include <thrust/iterator/constant_iterator.h>
45 #include <thrust/iterator/counting_iterator.h>
46 
333 
334 namespace nvbio {
335 
351 
366 template <typename string_type, typename index_iterator, typename qgram_iterator>
367 void generate_qgrams(
368  const uint32 q,
369  const uint32 symbol_size,
370  const uint32 string_len,
371  const string_type string,
372  const uint32 n_qgrams,
373  const index_iterator indices,
374  qgram_iterator qgrams);
375 
389 template <typename string_set_type, typename index_iterator, typename qgram_iterator>
390 void generate_qgrams(
391  const uint32 q,
392  const uint32 symbol_size,
393  const string_set_type string_set,
394  const uint32 n_qgrams,
395  const index_iterator indices,
396  qgram_iterator qgrams);
397 
410 
413 template <typename QGramVectorType, typename IndexVectorType, typename CoordVectorType>
415 {
416  // class typedefs
417  typedef QGramVectorType qgram_vector_type;
418  typedef IndexVectorType index_vector_type;
419  typedef CoordVectorType coord_vector_type;
420  typedef typename std::iterator_traits<qgram_vector_type>::value_type qgram_type;
421  typedef typename std::iterator_traits<coord_vector_type>::value_type coord_type;
422 
423  // plain view typedefs
426 
427  // unary functor typedefs
429  typedef uint2 result_type;
430 
433 
436  const uint32 _Q,
437  const uint32 _symbol_size,
438  const uint32 _n_qgrams,
439  const uint32 _n_unique_qgrams,
440  const qgram_vector_type _qgrams,
441  const index_vector_type _slots,
442  const coord_vector_type _index,
443  const uint32 _QL,
444  const uint32 _QLS,
445  const index_vector_type _lut) :
446  Q ( _Q ),
447  symbol_size ( _symbol_size ),
448  n_qgrams ( _n_qgrams ),
449  n_unique_qgrams ( _n_unique_qgrams ),
450  QL ( _QL ),
451  QLS ( _QLS ),
452  qgrams ( _qgrams ),
453  slots ( _slots ),
454  index ( _index ),
455  lut ( _lut ) {}
456 
460  uint2 range(const qgram_type g) const
461  {
462  const uint32 g_lut = uint32( g >> QLS );
463 
464  // consult the lookup table if there is one
465  const uint2 lut_range = lut ?
466  make_uint2( lut[ g_lut ], lut[ g_lut + 1 ] ) :
467  make_uint2( 0u, n_unique_qgrams );
468 
469  // find the slot where our q-gram is stored
470  const uint32 i = uint32( nvbio::lower_bound(
471  g,
472  qgrams + lut_range.x,
473  (lut_range.y - lut_range.x) ) - qgrams );
474 
475  // check whether we found what we are looking for
476  if ((i >= n_unique_qgrams) || (g != qgrams[i]))
477  return make_uint2( 0u, 0u );
478 
479  // return the range
480  return make_uint2( slots[i], slots[i+1] );
481  }
482 
486  uint2 operator() (const qgram_type g) const { return range( g ); }
487 
491  coord_type locate(const uint32 i) const { return index[i]; }
492 
493 
504 };
505 
508 template <typename SystemTag, typename QGramType, typename IndexType, typename CoordType>
510 {
511  typedef SystemTag system_tag;
512 
513  typedef QGramType qgram_type;
514  typedef IndexType index_type;
515  typedef CoordType coord_type;
519 
522 
524 
528  {
529  return equal<system_tag,host_tag>() ?
530  qgrams.size() * sizeof(qgram_type) +
531  slots.size() * sizeof(uint32) +
532  index.size() * sizeof(coord_type) +
533  lut.size() * sizeof(uint32) :
534  0u;
535  }
536 
540  {
541  return equal<system_tag,device_tag>() ?
542  qgrams.size() * sizeof(qgram_type) +
543  slots.size() * sizeof(uint32) +
544  index.size() * sizeof(coord_type) +
545  lut.size() * sizeof(uint32) :
546  0u;
547  }
548 
556 
560 };
561 
566 
569 struct QGramIndexHost : public QGramIndexCore<host_tag,uint64,uint32,uint32>
570 {
572 
574 
582 
585  template <typename SystemTag>
587 };
588 
591 struct QGramIndexDevice : public QGramIndexCore<device_tag,uint64,uint32,uint32>
592 {
594 
595  typedef QGramIndexCore<
596  device_tag,
597  uint64,
598  uint32,
600 
608 
620  template <typename string_type>
621  void build(
622  const uint32 q,
623  const uint32 symbol_sz,
624  const uint32 string_len,
625  const string_type string,
626  const uint32 qlut = 0);
627 
630  template <typename SystemTag>
632 };
633 
636 struct QGramSetIndexHost : public QGramIndexCore<host_tag,uint64,uint32,uint2>
637 {
639 
640  typedef QGramIndexCore<
641  host_tag,
642  uint64,
643  uint32,
644  uint2> core_type;
645 
653 
656  template <typename SystemTag>
658 };
659 
662 struct QGramSetIndexDevice : public QGramIndexCore<device_tag,uint64,uint32,uint2>
663 {
665 
666  typedef QGramIndexCore<
667  device_tag,
668  uint64,
669  uint32,
670  uint2> core_type;
671 
679 
690  template <typename string_set_type>
691  void build(
692  const uint32 q,
693  const uint32 symbol_sz,
694  const string_set_type string_set,
695  const uint32 qlut = 0);
696 
709  template <typename string_set_type, typename seed_functor>
710  void build(
711  const uint32 q,
712  const uint32 symbol_sz,
713  const string_set_type string_set,
714  const seed_functor seeder,
715  const uint32 qlut = 0);
716 
719  template <typename SystemTag>
721 };
722 
723 template<> struct plain_view_subtype<QGramIndexHost> { typedef QGramIndexView type; };
724 template<> struct plain_view_subtype<QGramIndexDevice> { typedef QGramIndexView type; };
727 
732 
735 template <typename SystemTag, typename QT, typename IT, typename CT>
737 
740 template <typename SystemTag, typename QT, typename IT, typename CT>
742 {
744  qgram.Q,
745  qgram.symbol_size,
746  qgram.n_qgrams,
747  qgram.n_unique_qgrams,
748  nvbio::plain_view( qgram.qgrams ),
749  nvbio::plain_view( qgram.slots ),
750  nvbio::plain_view( qgram.index ),
751  qgram.QL,
752  qgram.QLS,
753  nvbio::plain_view( qgram.lut ) );
754 }
755 
758 template <typename SystemTag, typename QT, typename IT, typename CT>
760 {
762  qgram.Q,
763  qgram.symbol_size,
764  qgram.n_qgrams,
765  qgram.n_unique_qgrams,
766  nvbio::plain_view( qgram.qgrams ),
767  nvbio::plain_view( qgram.slots ),
768  nvbio::plain_view( qgram.index ),
769  qgram.QL,
770  qgram.QLS,
771  nvbio::plain_view( qgram.lut ) );
772 }
773 
776 template <typename qgram_index_type>
778 {
781 
785  qgram_locate_functor(const qgram_index_type _index) : index(_index) {}
786 
790  result_type operator() (const uint32 slot) const { return index.locate( slot ); }
791 
792  const qgram_index_type index;
793 };
794 
796 
801 template <typename string_type>
803 {
806 
815  string_qgram_functor(const uint32 _Q, const uint32 _symbol_size, const uint32 _string_len, const string_type _string) :
816  Q ( _Q ),
817  symbol_size ( _symbol_size ),
818  symbol_mask ( (1u << _symbol_size) - 1u ),
819  string_len ( _string_len ),
820  string ( _string ) {}
821 
827  uint64 operator() (const uint32 i) const
828  {
829  uint64 qgram = 0u;
830  for (uint32 j = 0; j < Q; ++j)
831  qgram |= uint64(i+j < string_len ? (string[i + j] & symbol_mask) : 0u) << (j*symbol_size);
832 
833  return qgram;
834  }
835 
836  const uint32 Q;
840  const string_type string;
841 };
842 
847 template <typename string_set_type>
849 {
852 
853  typedef typename string_set_type::string_type string_type;
854 
862  string_set_qgram_functor(const uint32 _Q, const uint32 _symbol_size, const string_set_type _string_set) :
863  Q ( _Q ),
864  symbol_size ( _symbol_size ),
865  symbol_mask ( (1u << _symbol_size) - 1u ),
866  string_set ( _string_set ) {}
867 
873  uint64 operator() (const uint2 id) const
874  {
875  const uint32 string_id = id.x;
876  const uint32 string_pos = id.y;
877  const string_type string = string_set[ string_id ];
878 
879  const uint32 string_len = string.length();
880 
881  uint64 qgram = 0u;
882  for (uint32 j = 0; j < Q; ++j)
883  qgram |= uint64(string_pos + j < string_len ? (string[string_pos + j] & symbol_mask) : 0u) << (j*symbol_size);
884 
885  return qgram;
886  }
887 
888  const uint32 Q;
891  const string_set_type string_set;
892 };
893 
896 template <typename qgram_index_type, typename string_type>
898 {
900  typedef uint2 result_type;
901 
905  const qgram_index_type _qgram_index,
906  const uint32 _string_len,
907  const string_type _string) :
908  qgram_index ( _qgram_index ),
909  string_len ( _string_len ),
910  string ( _string ) {}
911 
915  uint2 operator() (const uint32 i) const
916  {
917  const string_qgram_functor<string_type> qgram( qgram_index.Q, qgram_index.symbol_size, string_len, string );
918 
919  return qgram_index.range( qgram(i) );
920  }
921 
922  const qgram_index_type qgram_index;
924  const string_type string;
925 };
926 
928 
929 } // namespace nvbio
930 
931 #include <nvbio/qgram/qgram_inl.h>