NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
qgram_inl.h
Go to the documentation of this file.
1 /*
2  * nvbio
3  * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #pragma once
29 
30 namespace nvbio {
31 
32 // build a q-group index from a given string
33 //
34 // \param q the q parameter
35 // \param string_len the size of the string
36 // \param string the string iterator
37 //
38 template <typename string_type>
40  const uint32 q,
41  const uint32 symbol_sz,
42  const uint32 string_len,
43  const string_type string,
44  const uint32 qlut)
45 {
46  thrust::device_vector<uint8> d_temp_storage;
47 
48  symbol_size = symbol_sz;
49  Q = q;
50  QL = qlut;
51  QLS = (Q - QL) * symbol_size;
52 
53  n_qgrams = string_len;
54 
55  qgrams.resize( string_len );
56  index.resize( string_len );
57 
58  thrust::device_vector<qgram_type> d_all_qgrams( align<32>( string_len ) * 2u );
59  thrust::device_vector<uint32> d_temp_index( string_len );
60 
61  // build the list of q-grams
63  thrust::make_counting_iterator<uint32>(0u),
64  thrust::make_counting_iterator<uint32>(0u) + string_len,
65  d_all_qgrams.begin(),
66  string_qgram_functor<string_type>( Q, symbol_size, string_len, string ) );
67 
68  // build the list of q-gram indices
70  thrust::make_counting_iterator<uint32>(0u),
71  thrust::make_counting_iterator<uint32>(0u) + string_len,
72  index.begin() );
73 
74  // create the ping-pong sorting buffers
75  cub::DoubleBuffer<qgram_type> key_buffers;
76  cub::DoubleBuffer<uint32> value_buffers;
77 
78  key_buffers.selector = 0;
79  value_buffers.selector = 0;
80  key_buffers.d_buffers[0] = nvbio::raw_pointer( d_all_qgrams );
81  key_buffers.d_buffers[1] = nvbio::raw_pointer( d_all_qgrams ) + align<32>( string_len );
82  value_buffers.d_buffers[0] = nvbio::raw_pointer( index );
83  value_buffers.d_buffers[1] = nvbio::raw_pointer( d_temp_index );
84 
85  size_t temp_storage_bytes = 0;
86 
87  // gauge the amount of temp storage we need
88  cub::DeviceRadixSort::SortPairs( NULL, temp_storage_bytes, key_buffers, value_buffers, string_len, 0u, Q * symbol_size );
89 
90  // resize the temp storage vector
91  d_temp_storage.clear();
92  d_temp_storage.resize( temp_storage_bytes );
93 
94  // do the real run
95  cub::DeviceRadixSort::SortPairs( nvbio::raw_pointer( d_temp_storage ), temp_storage_bytes, key_buffers, value_buffers, string_len, 0u, Q * symbol_size );
96 
97  // swap the index vector if needed
98  if (value_buffers.selector)
99  index.swap( d_temp_index );
100 
101  // copy only the unique q-grams and count them
102  thrust::device_vector<uint32> d_counts( string_len + 1u );
103 
105  string_len,
106  key_buffers.d_buffers[ key_buffers.selector ],
107  qgrams.begin(),
108  d_counts.begin(),
109  d_temp_storage );
110 
111  // now we know how many unique q-grams there are
112  slots.resize( n_unique_qgrams + 1u );
113 
114  // scan the counts to get the slots
116  n_unique_qgrams + 1u,
117  d_counts.begin(),
118  slots.begin(),
119  thrust::plus<uint32>(),
120  uint32(0),
121  d_temp_storage );
122 
123  // shrink the q-gram vector
124  qgrams.resize( n_unique_qgrams );
125 
126  const uint32 n_slots = slots[ n_unique_qgrams ];
127  if (n_slots != string_len)
128  throw runtime_error( "mismatching number of q-grams: inserted %u q-grams, got: %u\n" );
129 
130  //
131  // build a LUT
132  //
133 
134  if (QL)
135  {
136  const uint32 ALPHABET_SIZE = 1u << symbol_size;
137 
138  uint64 lut_size = 1;
139  for (uint32 i = 0; i < QL; ++i)
140  lut_size *= ALPHABET_SIZE;
141 
142  // and now search them
143  lut.resize( lut_size+1 );
144 
146  qgrams.begin(),
147  qgrams.begin() + n_unique_qgrams,
148  thrust::make_transform_iterator( thrust::make_counting_iterator<uint32>(0), shift_left<qgram_type>( QLS ) ),
149  thrust::make_transform_iterator( thrust::make_counting_iterator<uint32>(0), shift_left<qgram_type>( QLS ) ) + lut_size,
150  lut.begin() );
151 
152  // and write a sentinel value
153  lut[ lut_size ] = n_unique_qgrams;
154  }
155  else
156  lut.resize(0);
157 }
158 
159 // A functor to localize a string-set index
160 //
161 template <typename string_set_type>
163 {
165  typedef uint2 result_type;
166 
167  // constructor
168  //
170  localize_functor(const string_set_type _string_set, const uint32* _cum_lengths) :
171  string_set(_string_set), cum_lengths(_cum_lengths) {}
172 
173  // return the length of the i-th string
174  //
176  uint2 operator() (const uint32 global_idx) const
177  {
178  const uint32 string_id = uint32( upper_bound( global_idx, cum_lengths, string_set.size() ) - cum_lengths );
179 
180  const uint32 base_offset = string_id ? cum_lengths[ string_id-1 ] : 0u;
181 
182  return make_uint2( string_id, global_idx - base_offset );
183  }
184 
185  const string_set_type string_set;
187 };
188 
189 // build a q-group index from a given string set
190 //
191 // \param q the q parameter
192 // \param string-set the string-set
193 //
194 template <typename string_set_type, typename seed_functor>
196  const uint32 q,
197  const uint32 symbol_sz,
198  const string_set_type string_set,
199  const seed_functor seeder,
200  const uint32 qlut)
201 {
202  thrust::device_vector<uint8> d_temp_storage;
203 
204  symbol_size = symbol_sz;
205  Q = q;
206  QL = qlut;
207  QLS = (Q - QL) * symbol_size;
208 
209  // extract the list of q-gram coordinates
211  string_set,
212  seeder,
213  index );
214 
215  thrust::device_vector<qgram_type> d_all_qgrams( align<32>( n_qgrams ) * 2u );
216  thrust::device_vector<uint2> d_temp_index( n_qgrams );
217 
218  // build the list of q-grams
220  index.begin(),
221  index.begin() + n_qgrams,
222  d_all_qgrams.begin(),
224 
225  // create the ping-pong sorting buffers
226  cub::DoubleBuffer<qgram_type> key_buffers;
227  cub::DoubleBuffer<uint64> value_buffers;
228 
229  key_buffers.selector = 0;
230  value_buffers.selector = 0;
231  key_buffers.d_buffers[0] = nvbio::raw_pointer( d_all_qgrams );
232  key_buffers.d_buffers[1] = nvbio::raw_pointer( d_all_qgrams ) + align<32>( n_qgrams );
233  value_buffers.d_buffers[0] = (uint64*)nvbio::raw_pointer( index );
234  value_buffers.d_buffers[1] = (uint64*)nvbio::raw_pointer( d_temp_index );
235 
236  size_t temp_storage_bytes = 0;
237 
238  // gauge the amount of temp storage we need
239  cub::DeviceRadixSort::SortPairs( NULL, temp_storage_bytes, key_buffers, value_buffers, n_qgrams, 0u, Q * symbol_size );
240 
241  // resize the temp storage vector
242  d_temp_storage.clear();
243  d_temp_storage.resize( temp_storage_bytes );
244 
245  // do the real run
246  cub::DeviceRadixSort::SortPairs( nvbio::raw_pointer( d_temp_storage ), temp_storage_bytes, key_buffers, value_buffers, n_qgrams, 0u, Q * symbol_size );
247 
248  // swap the index vector if needed
249  if (value_buffers.selector)
250  index.swap( d_temp_index );
251 
252  // reserve enough storage for the output q-grams
253  qgrams.resize( n_qgrams );
254 
255  // copy only the unique q-grams and count them
256  thrust::device_vector<uint32> d_counts( n_qgrams + 1u );
257 
259  n_qgrams,
260  key_buffers.d_buffers[ key_buffers.selector ],
261  qgrams.begin(),
262  d_counts.begin(),
263  d_temp_storage );
264 
265  // now we know how many unique q-grams there are
266  slots.resize( n_unique_qgrams + 1u );
267 
268  // scan the counts to get the slots
270  n_unique_qgrams + 1u,
271  d_counts.begin(),
272  slots.begin(),
273  thrust::plus<uint32>(),
274  uint32(0),
275  d_temp_storage );
276 
277  // shrink the q-gram vector
278  qgrams.resize( n_unique_qgrams );
279 
280  const uint32 n_slots = slots[ n_unique_qgrams ];
281  if (n_slots != n_qgrams)
282  throw runtime_error( "mismatching number of q-grams: inserted %u q-grams, got: %u\n" );
283 
284  //
285  // build a LUT
286  //
287 
288  if (QL)
289  {
290  const uint32 ALPHABET_SIZE = 1u << symbol_size;
291 
292  uint64 lut_size = 1;
293  for (uint32 i = 0; i < QL; ++i)
294  lut_size *= ALPHABET_SIZE;
295 
296  // build a set of spaced q-grams and search them
297  lut.resize( lut_size+1 );
298 
300  qgrams.begin(),
301  qgrams.begin() + n_unique_qgrams,
302  thrust::make_transform_iterator( thrust::make_counting_iterator<uint32>(0), shift_left<qgram_type>( QLS ) ),
303  thrust::make_transform_iterator( thrust::make_counting_iterator<uint32>(0), shift_left<qgram_type>( QLS ) ) + lut_size,
304  lut.begin() );
305 
306  // and write a sentinel value
307  lut[ lut_size ] = n_unique_qgrams;
308  }
309  else
310  lut.resize(0);
311 }
312 
313 // build a q-group index from a given string set
314 //
315 // \param q the q parameter
316 // \param string-set the string-set
317 //
318 template <typename string_set_type>
320  const uint32 q,
321  const uint32 symbol_sz,
322  const string_set_type string_set,
323  const uint32 qlut)
324 {
325  build(
326  q,
327  symbol_sz,
328  string_set,
329  uniform_seeds_functor<>( q, 1u ),
330  qlut );
331 }
332 
333 // copy operator
334 //
335 template <typename SystemTag>
337 {
338  Q = src.Q;
339  symbol_size = src.symbol_size;
340  n_qgrams = src.n_qgrams;
342  qgrams = src.qgrams;
343  slots = src.slots;
344  index = src.index;
345  QL = src.QL;
346  QLS = src.QLS;
347  lut = src.lut;
348  return *this;
349 }
350 
351 // copy operator
352 //
353 template <typename SystemTag>
355 {
356  Q = src.Q;
357  symbol_size = src.symbol_size;
358  n_qgrams = src.n_qgrams;
360  qgrams = src.qgrams;
361  slots = src.slots;
362  index = src.index;
363  QL = src.QL;
364  QLS = src.QLS;
365  lut = src.lut;
366  return *this;
367 }
368 
369 // copy operator
370 //
371 template <typename SystemTag>
373 {
374  Q = src.Q;
375  symbol_size = src.symbol_size;
377  qgrams = src.qgrams;
378  slots = src.slots;
379  index = src.index;
380  QL = src.QL;
381  QLS = src.QLS;
382  lut = src.lut;
383  return *this;
384 }
385 
386 // copy operator
387 //
388 template <typename SystemTag>
390 {
391  Q = src.Q;
392  symbol_size = src.symbol_size;
394  qgrams = src.qgrams;
395  slots = src.slots;
396  index = src.index;
397  QL = src.QL;
398  QLS = src.QLS;
399  lut = src.lut;
400  return *this;
401 }
402 
403 // generate the q-grams corresponding to a list of q-gram coordinates
404 //
405 // \tparam string_type a string iterator
406 // \tparam index_iterator a q-gram coordinate iterator
407 // \tparam qgram_iterator a q-gram iterator
408 //
409 // \param q the q-gram length
410 // \param symbol_size the symbol size, in bits
411 // \param string_len the input string length
412 // \param string the input string
413 // \param n_qgrams the number of q-grams to generate
414 // \param indices the input q-gram coordinates
415 // \param indices the output q-grams
416 //
417 template <typename string_type, typename index_iterator, typename qgram_iterator>
419  const uint32 q,
420  const uint32 symbol_size,
421  const uint32 string_len,
422  const string_type string,
423  const uint32 n_qgrams,
424  const index_iterator indices,
425  qgram_iterator qgrams)
426 {
428  indices,
429  indices + n_qgrams,
430  qgrams,
431  string_qgram_functor<string_type>( q, symbol_size, string_len, string ) );
432 }
433 
434 // generate the q-grams corresponding to a list of q-gram coordinates
435 //
436 // \tparam string_type a string iterator
437 // \tparam index_iterator a q-gram coordinate iterator
438 // \tparam qgram_iterator a q-gram iterator
439 //
440 // \param q the q-gram length
441 // \param symbol_size the symbol size, in bits
442 // \param string_set the input string-set
443 // \param n_qgrams the number of q-grams to generate
444 // \param indices the input q-gram coordinates
445 // \param indices the output q-grams
446 //
447 template <typename string_set_type, typename index_iterator, typename qgram_iterator>
449  const uint32 q,
450  const uint32 symbol_size,
451  const string_set_type string_set,
452  const uint32 n_qgrams,
453  const index_iterator indices,
454  qgram_iterator qgrams)
455 {
457  indices,
458  indices + n_qgrams,
459  qgrams,
460  string_set_qgram_functor<string_set_type>( q, symbol_size, string_set ) );
461 }
462 
463 } // namespace nvbio