NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
filter_inl.h
Go to the documentation of this file.
1 /*
2  * nvbio
3  * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #pragma once
29 
30 namespace nvbio {
31 
32 namespace qgram {
33 
34 // return the size of a given range
35 struct range_size
36 {
37  typedef uint2 argument_type;
39 
41  uint64 operator() (const uint2 range) const { return range.y - range.x; }
42 };
43 
44 // given a (qgram-pos, text-pos) pair, return the closest regularly-spaced diagonal
45 template <typename hit_type>
46 struct closest_diagonal {};
47 
48 // given a (qgram-pos, text-pos) pair, return the closest regularly-spaced diagonal
49 template <>
50 struct closest_diagonal<uint2>
51 {
52  typedef uint2 argument_type;
54 
56  closest_diagonal(const uint32 _interval) : interval(_interval) {}
57 
59  result_type operator() (const uint2 range) const
60  {
61  const uint32 diag = /*qgram_index_string_size + */ range.y - range.x;
62  return util::round( diag, interval );
63  }
64 
66 };
67 
68 // given a (qgram-id, q-gram-pos, text-pos) pair, return the closest regularly-spaced diagonal
69 // note:
70 // in this case the output type is a (qgram-id,diag) uint2
71 //
72 template <>
73 struct closest_diagonal<uint4>
74 {
75  typedef uint4 argument_type;
76  typedef uint2 result_type;
77 
79  closest_diagonal(const uint32 _interval) : interval(_interval) {}
80 
82  result_type operator() (const uint4 range) const
83  {
84  const uint32 diag = /*qgram_index.string_length(range.x) + */ range.z - range.y;
85  const uint32 rounded_diag = util::round( diag, interval );
86 
87  return make_uint2( rounded_diag, range.x );
88  }
89 
91 };
92 
93 template <typename qgram_index_type, typename index_iterator, typename coord_type>
94 struct filter_results {};
95 
96 template <typename qgram_index_type, typename index_iterator>
97 struct filter_results< qgram_index_type, index_iterator, uint32 >
98 {
100  typedef uint2 result_type;
101 
102  // constructor
104  const qgram_index_type _qgram_index,
105  const uint32 _n_queries,
106  const uint64* _slots,
107  const uint2* _ranges,
108  const index_iterator _index) :
109  qgram_index ( _qgram_index ),
110  n_queries ( _n_queries ),
111  slots ( _slots ),
112  ranges ( _ranges ),
113  index ( _index ) {}
114 
115  // functor operator
117  result_type operator() (const uint64 output_index) const
118  {
119  // find the text q-gram slot corresponding to this output index
120  const uint32 slot = uint32( upper_bound(
121  output_index,
122  slots,
123  n_queries ) - slots );
124 
125  // fetch the corresponding text position
126  const uint32 text_pos = index[ slot ];
127 
128  // locate the hit q-gram position
129  const uint2 range = ranges[ slot ];
130  const uint64 base_slot = slot ? slots[ slot-1 ] : 0u;
131  const uint32 local_index = output_index - base_slot;
132 
133  const uint32 qgram_pos = qgram_index.locate( range.x + local_index );
134 
135  // and write out the pair (qgram_pos,text_pos)
136  return make_uint2( qgram_pos, text_pos );
137  }
138 
139  const qgram_index_type qgram_index;
141  const uint64* slots;
142  const uint2* ranges;
143  const index_iterator index;
144 };
145 
146 template <typename qgram_index_type, typename index_iterator>
147 struct filter_results< qgram_index_type, index_iterator, uint2 >
148 {
150  typedef uint4 result_type;
151 
152  // constructor
154  const qgram_index_type _qgram_index,
155  const uint32 _n_queries,
156  const uint64* _slots,
157  const uint2* _ranges,
158  const index_iterator _index) :
159  qgram_index ( _qgram_index ),
160  n_queries ( _n_queries ),
161  slots ( _slots ),
162  ranges ( _ranges ),
163  index ( _index ) {}
164 
165  // functor operator
167  result_type operator() (const uint64 output_index) const
168  {
169  // find the text q-gram slot corresponding to this output index
170  const uint32 slot = uint32( upper_bound(
171  output_index,
172  slots,
173  n_queries ) - slots );
174 
175  // fetch the corresponding text position
176  const uint32 text_pos = index[ slot ];
177 
178  // locate the hit q-gram position
179  const uint2 range = ranges[ slot ];
180  const uint32 base_slot = slot ? slots[ slot-1 ] : 0u;
181  const uint32 local_index = output_index - base_slot;
182 
183  const uint2 qgram_pos = qgram_index.locate( range.x + local_index );
184 
185  // and write out the tuple (index-id,index-pos,text-pos)
186  return make_uint4( qgram_pos.x, qgram_pos.y, text_pos, 0u );
187  }
188 
189  const qgram_index_type qgram_index;
191  const uint64* slots;
192  const uint2* ranges;
193  const index_iterator index;
194 };
195 
196 } // namespace qgram
197 
198 // enact the q-gram filter
199 //
200 // \param qgram_index the q-gram index
201 // \param n_queries the number of query q-grams
202 // \param queries the query q-grams
203 // \param indices the query indices
204 //
205 template <typename qgram_index_type, typename query_iterator, typename index_iterator>
207  const qgram_index_type& qgram_index,
208  const uint32 n_queries,
209  const query_iterator queries,
210  const index_iterator indices)
211 {
212  typedef typename qgram_index_type::coord_type coord_type;
213 
214  // save the query
215  m_n_queries = n_queries;
216  m_queries = queries;
217  m_indices = indices;
218  m_qgram_index = nvbio::plain_view( qgram_index );
219 
220  // alloc enough storage for the results
221  m_ranges.resize( n_queries );
222  m_slots.resize( n_queries );
223 
224  // search the q-grams in the index, obtaining a set of ranges
226  queries,
227  queries + n_queries,
228  m_ranges.begin(),
229  m_qgram_index );
230 
231  // scan their size to determine the slots
233  thrust::make_transform_iterator( m_ranges.begin(), qgram::range_size() ),
234  thrust::make_transform_iterator( m_ranges.begin(), qgram::range_size() ) + n_queries,
235  m_slots.begin() );
236 
237  // determine the total number of occurrences
238  m_n_occurrences = m_slots[ n_queries-1 ];
239  return m_n_occurrences;
240 }
241 
242 // enumerate all hits in a given range
243 //
244 template <typename qgram_index_type, typename query_iterator, typename index_iterator>
245 template <typename hits_iterator>
247  const uint64 begin,
248  const uint64 end,
249  hits_iterator hits)
250 {
251  typedef typename qgram_index_type::coord_type coord_type;
252 
253  // and fill it
255  thrust::make_counting_iterator<uint64>(0u) + begin,
256  thrust::make_counting_iterator<uint64>(0u) + end,
257  hits,
259  m_qgram_index,
260  m_n_queries,
261  nvbio::plain_view( m_slots ),
262  nvbio::plain_view( m_ranges ),
263  m_indices ) );
264 }
265 
266 // simply convert hits to diagonal coordinates
267 //
268 // \tparam hits_iterator a hit_iterator iterator
269 // \tparam output_iterator a diagonal_type iterator
270 //
271 // \param n_hits the number of input hits
272 // \param hits the input hits
273 // \param diags the output diagonals
274 //
275 template <typename qgram_index_type, typename query_iterator, typename index_iterator>
276 template <typename hits_iterator, typename output_iterator>
278  const uint32 n_hits,
279  const hits_iterator hits,
280  output_iterator diags,
281  const uint32 interval)
282 {
283  // snap the diagonals to the closest one
285  hits,
286  hits + n_hits,
287  diags,
288  qgram::closest_diagonal<hit_type>( interval ) );
289 }
290 
291 // merge hits falling within the same diagonal interval; this method will
292 // replace the vector of hits with a compacted list of hits snapped to the
293 // closest sample diagonal (i.e. multiple of the given interval), together
294 // with a counts vector providing the number of hits falling on the same
295 // spot
296 //
297 // \param interval the merging interval
298 // \param n_hits the number of input hits
299 // \param hits the input hits
300 // \param merged_hits the output merged hits
301 // \param merged_counts the output merged counts
302 // \return the number of merged hits
303 //
304 template <typename qgram_index_type, typename query_iterator, typename index_iterator>
305 template <typename hits_iterator, typename output_iterator, typename count_iterator>
307  const uint32 interval,
308  const uint32 n_hits,
309  const hits_iterator hits,
310  output_iterator merged_hits,
311  count_iterator merged_counts)
312 {
313  m_diags.resize( n_hits );
314 
315  // convert hits to diagonals and snap them to the closest one
316  diagonals( n_hits, hits, m_diags.begin(), interval );
317 
318  // now sort the results by diagonal (which can be either a uint32 or a uint2)
319  typedef typename if_equal<diagonal_type, uint32, uint32, uint64>::type primitive_type;
320 
321  primitive_type* raw_diags( (primitive_type*)nvbio::raw_pointer( m_diags ) );
322  thrust::sort(
323  raw_diags,
324  raw_diags + n_hits );
325 
326  // and run-length encode them
327  const uint32 n_merged = uint32( thrust::reduce_by_key(
328  m_diags.begin(),
329  m_diags.begin() + n_hits,
330  thrust::make_constant_iterator<uint32>(1u),
331  merged_hits,
332  merged_counts ).first - merged_hits );
333 
334  return n_merged;
335 }
336 
337 // enact the q-gram filter
338 //
339 // \param qgram_index the q-gram index
340 // \param n_queries the number of query q-grams
341 // \param queries the query q-grams
342 // \param indices the query indices
343 //
344 template <typename qgram_index_type, typename query_iterator, typename index_iterator>
346  const qgram_index_type& qgram_index,
347  const uint32 n_queries,
348  const query_iterator queries,
349  const index_iterator indices)
350 {
351  typedef typename qgram_index_type::coord_type coord_type;
352 
353  // save the query
354  m_n_queries = n_queries;
355  m_queries = queries;
356  m_indices = indices;
357  m_qgram_index = nvbio::plain_view( qgram_index );
358 
359  // alloc enough storage for the results
360  if (m_ranges.size() < n_queries)
361  {
362  m_ranges.clear();
363  m_ranges.resize( n_queries );
364  }
365  if (m_slots.size() < n_queries)
366  {
367  m_slots.clear();
368  m_slots.resize( n_queries );
369  }
370 
371  // search the q-grams in the index, obtaining a set of ranges
373  device_iterator( queries ),
374  device_iterator( queries ) + n_queries,
375  m_ranges.begin(),
376  m_qgram_index );
377 
378  // scan their size to determine the slots
380  n_queries,
381  thrust::make_transform_iterator( m_ranges.begin(), qgram::range_size() ),
382  m_slots.begin(),
383  thrust::plus<uint64>(),
384  d_temp_storage );
385 
386  // determine the total number of occurrences
387  m_n_occurrences = m_slots[ n_queries-1 ];
388  return m_n_occurrences;
389 }
390 
391 // enumerate all hits in a given range
392 //
393 template <typename qgram_index_type, typename query_iterator, typename index_iterator>
394 template <typename hits_iterator>
396  const uint64 begin,
397  const uint64 end,
398  hits_iterator hits)
399 {
400  typedef typename qgram_index_type::coord_type coord_type;
401 
402  // and fill it
404  thrust::make_counting_iterator<uint64>(0u) + begin,
405  thrust::make_counting_iterator<uint64>(0u) + end,
406  device_iterator( hits ),
408  m_qgram_index,
409  m_n_queries,
410  nvbio::plain_view( m_slots ),
411  nvbio::plain_view( m_ranges ),
412  m_indices ) );
413 }
414 
415 // simply convert hits to diagonal coordinates
416 //
417 // \tparam hits_iterator a hit_iterator iterator
418 // \tparam output_iterator a diagonal_type iterator
419 //
420 // \param n_hits the number of input hits
421 // \param hits the input hits
422 // \param diags the output diagonals
423 //
424 template <typename qgram_index_type, typename query_iterator, typename index_iterator>
425 template <typename hits_iterator, typename output_iterator>
427  const uint32 n_hits,
428  const hits_iterator hits,
429  output_iterator diags,
430  const uint32 interval)
431 {
432  // snap the diagonals to the closest one
434  device_iterator( hits ),
435  device_iterator( hits ) + n_hits,
436  device_iterator( diags ),
437  qgram::closest_diagonal<hit_type>( interval ) );
438 }
439 
440 // merge hits falling within the same diagonal interval; this method will
441 // replace the vector of hits with a compacted list of hits snapped to the
442 // closest sample diagonal (i.e. multiple of the given interval), together
443 // with a counts vector providing the number of hits falling on the same
444 // spot
445 //
446 // \param interval the merging interval
447 // \param n_hits the number of input hits
448 // \param hits the input hits
449 // \param merged_hits the output merged hits
450 // \param merged_counts the output merged counts
451 // \return the number of merged hits
452 //
453 template <typename qgram_index_type, typename query_iterator, typename index_iterator>
454 template <typename hits_iterator, typename output_iterator, typename count_iterator>
456  const uint32 interval,
457  const uint32 n_hits,
458  const hits_iterator hits,
459  output_iterator merged_hits,
460  count_iterator merged_counts)
461 {
462  // copy the hits to a temporary sorting buffer
463  const uint32 buffer_size = align<32>( n_hits );
464  m_diags.resize( buffer_size * 2u );
465 
466  // convert hits to diagonals and snap them to the closest one
467  diagonals( n_hits, hits, m_diags.begin(), interval );
468 
469  // now sort the results by diagonal (which can be either a uint32 or a uint2)
470  typedef typename if_equal<diagonal_type, uint32, uint32, uint64>::type primitive_type;
471 
472  primitive_type* raw_diags( (primitive_type*)nvbio::raw_pointer( m_diags ) );
473 
475  sort_buffers.keys[0] = raw_diags;
476  sort_buffers.keys[1] = raw_diags + buffer_size;
477 
478  cuda::SortEnactor sort_enactor;
479  sort_enactor.sort( n_hits, sort_buffers );
480 
481  // and run-length encode them
482  const uint32 n_merged = cuda::runlength_encode(
483  n_hits,
484  m_diags.begin() + sort_buffers.selector * buffer_size,
485  merged_hits,
486  merged_counts,
487  d_temp_storage );
488 
489  return n_merged;
490 }
491 
492 } // namespace nvbio