NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
kmers.h
Go to the documentation of this file.
1 /*
2  * nvbio
3  * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #pragma once
29 
30 #include "assembly_types.h"
31 
32 using namespace nvbio;
33 
34 #define MAX_TEMP_SPACE_BATCHES 5
35 
36 // set of kmers from a set of sequences
37 template <typename string_set_type>
38 struct D_KmerSet
39 {
40  const string_set_type string_set; // string set from which the kmers are extracted
41  D_VectorU32 active_region_ids; // active region id of each sequence in the set
42  uint32 kmer_size; // kmer size used to extract the kmers
43 
44  D_VectorSetKmerCoord coords; // coordinates in the sequence set
45  D_VectorU64 kmers_64b; // kmer keys: 64-bit compacted kmer sequence
46  D_VectorU64 kmers_64b_distinct; // distinct kmer keys
47 
48  D_VectorU32 kmers_64b_unique_idxs; // indices to unique kmers in the sorted coordinate vector
49  D_VectorU32 kmers_64b_repeat_idxs; // indices to unique kmers in the sorted coordinate vector
50  D_VectorU32 global_to_sorted_id_map;// map[i] = sorted array index of coord with global id i
51  D_VectorU32 global_to_UID_map; // map from global id to the kmer UID
52  D_VectorU8 global_unique_flag_map; // map[i] = 1 if kmer with global id i is unique; 0, otherwise
53 
54  D_VectorU32 kmers_counts; // number of occurrences of each kmer in the set
55 
56  uint32 n_kmers; // total number of kmers in the set
57  uint32 n_distinct; // number of distinct kmer keys in the set
58  uint32 n_unique; // number of distinct unique kmers (unique := kmer does not occur more than once in a sequence)
59  uint32 n_repeat; // number of non-unique kmers that are extracted as graph nodes
60 
61 
62  // super kmers (to handle non-unique kmers)
67 
68  // scratch space
72 
73  D_KmerSet(): kmer_size(0), n_kmers(0), n_distinct(0), n_unique(0), n_repeat(0), n_super_coords(0),
74  n_alloc(0), selector(0) {}
75  D_KmerSet(const string_set_type _string_set, const uint32 _kmer_size):
76  string_set(_string_set), kmer_size(_kmer_size),
77  n_kmers(0), n_distinct(0), n_unique(0), n_repeat(0), n_super_coords(0),
78  n_alloc(0), selector(0) { }
79  D_KmerSet(const string_set_type _string_set, const D_VectorU32& _active_region_ids):
80  string_set(_string_set), active_region_ids(_active_region_ids),
81  kmer_size(0), n_kmers(0), n_distinct(0), n_unique(0), n_repeat(0), n_super_coords(0),
82  n_alloc(0), selector(0) { }
83 
84  void gen_kmer_coords();
85  void gen_kmer_64b_keys();
86  void sort_kmers_by_64b_keys();
87  void segmented_sort_kmers_by_64b_keys();
88  template <typename meta_iterator_type>
89  void sort_kmers_by_64b_keys_meta(const meta_iterator_type meta_data);
90  void sort_kmers_by_64b_keys_seqid();
91  template <typename meta_iterator_type>
92  void sort_kmers_by_64b_keys_seqid_meta(const meta_iterator_type meta_data);
93  void count_kmers_rle();
94  void count_kmers();
95  void count_distinct_by_prefix(D_VectorU32& prefix_unique_id_map);
96  void partition_kmers_by_uniqueness();
97  void gen_prefix_map();
98  void gen_global_unique_map();
99  void gen_global_UID_map();
100  void gen_global_to_sorted_id_map();
101  void mark_unique_kmers();
102  void filter_coords_by_prefix_uniqueness(const D_VectorU8& unique_map);
103  void extract_super_kmers();
104  void collapse_and_extract_non_overlaps(D_VectorSetKmerCoord& kmers_out, D_VectorU32& prefix_ids_out,
105  D_VectorU32& suffix_ids_out, D_VectorU32& counts_out);
106  void count_distinct_by_prefix();
107 
108 
110  {
111  n_alloc = MAX_TEMP_SPACE_BATCHES;
112  scratch_u32.resize(n_alloc*n_kmers);
113  }
114 
115  D_VectorU32::iterator get_scratch_space(uint32 size)
116  {
117  if((size > n_kmers) || ((selector + 1) > MAX_TEMP_SPACE_BATCHES)) {
118  printf("Requested more memory than batch size \n");
119  exit(-1);
120  }
121  selector++;
122  return scratch_u32.begin() + (selector-1)*n_kmers;
123  }
124 
126  {
127  selector = 0;
128  }
129 };
130 
131 #include "kmers_inl.h"