NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
filter_inl.h
Go to the documentation of this file.
1 /*
2  * nvbio
3  * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #pragma once
29 
30 namespace nvbio {
31 
32 namespace fmindex {
33 
34 // return the size of a given range
35 template <typename range_type>
36 struct range_size
37 {
38  typedef range_type argument_type;
40 
42  uint64 operator() (const range_type range) const { return 1u + range.y - range.x; }
43 };
44 
45 template <typename index_type, typename string_set_type>
47 {
48  typedef typename index_type::range_type range_type;
49 
52 
53  // constructor
56  const index_type _index,
57  const string_set_type _string_set) :
58  index ( _index ),
59  string_set ( _string_set ) {}
60 
61  // functor operator
64  {
65  typedef typename string_set_type::string_type string_type;
66 
67  // fetch the given string
68  const string_type string = string_set[ string_id ];
69 
70  // and match it in the FM-index
71  return match( index, string, length( string ) );
72  }
73 
74  const index_type index;
75  const string_set_type string_set;
76 };
77 
78 template <typename range_type>
80 {
82 
84  typedef range_type result_type;
85 
86  // constructor
89  const uint32 _n_queries,
90  const uint64* _slots,
91  const range_type* _ranges) :
92  n_queries ( _n_queries ),
93  slots ( _slots ),
94  ranges ( _ranges ) {}
95 
96  // functor operator
98  result_type operator() (const uint64 output_index) const
99  {
100  // find the text q-gram slot corresponding to this output index
101  const uint32 slot = uint32( upper_bound(
102  output_index,
103  slots,
104  n_queries ) - slots );
105 
106  // fetch the corresponding text position
107  const uint32 string_id = slot;
108 
109  // locate the hit position
110  const range_type range = ranges[ slot ];
111  const uint64 base_slot = slot ? slots[ slot-1 ] : 0u;
112  const uint32 local_index = output_index - base_slot;
113 
114  // and write out the pair (qgram_pos,text_pos)
115  return make_vector( coord_type( range.x + local_index ), coord_type( string_id ) );
116  }
117 
119  const uint64* slots;
120  const range_type* ranges;
121 };
122 
123 template <typename index_type>
125 {
126  typedef typename index_type::range_type range_type;
127 
130 
131  // constructor
133  locate_results(const index_type _index) : index( _index ) {}
134 
135  // functor operator
138  {
139  return make_vector( locate( index, pair.x ), pair.y );
140  }
141 
142  const index_type index;
143 };
144 
145 template <typename index_type>
147 {
148  typedef typename index_type::range_type range_type;
149 
152 
153  // constructor
155  locate_ssa_results(const index_type _index) : index( _index ) {}
156 
157  // functor operator
160  {
161  return locate_ssa_iterator( index, pair.x );
162  }
163 
164  const index_type index;
165 };
166 
167 template <typename index_type>
169 {
170  typedef typename index_type::range_type range_type;
171 
174  typedef range_type result_type; // TODO: this should be the filter's hit_type
175 
176  // constructor
178  lookup_ssa_results(const index_type _index) : index( _index ) {}
179 
180  // functor operator
182  result_type operator() (const range_type pair, const range_type ssa) const
183  {
184  return make_vector( lookup_ssa_iterator( index, ssa ), pair.y );
185  }
186 
187  const index_type index;
188 };
189 
190 } // namespace fmindex
191 
192 
193 // enact the filter on an FM-index and a string-set
194 //
195 // \param fm_index the FM-index
196 // \param string-set the query string-set
197 //
198 // \return the total number of hits
199 //
200 template <typename fm_index_type>
201 template <typename string_set_type>
203  const fm_index_type& index,
204  const string_set_type& string_set)
205 {
206  // save the query
207  m_n_queries = string_set.size();
208  m_index = index;
209 
210  // alloc enough storage for the results
211  m_ranges.resize( m_n_queries );
212  m_slots.resize( m_n_queries );
213 
214  // search the strings in the index, obtaining a set of ranges
216  thrust::make_counting_iterator<uint32>(0u),
217  thrust::make_counting_iterator<uint32>(0u) + m_n_queries,
218  m_ranges.begin(),
220 
221  // scan their size to determine the slots
224  thrust::make_transform_iterator( m_ranges.begin(), fmindex::range_size<range_type>() ) + m_n_queries,
225  m_slots.begin() );
226 
227  // determine the total number of occurrences
228  m_n_occurrences = m_slots[ m_n_queries-1 ];
229  return m_n_occurrences;
230 }
231 
232 // enumerate all hits in a given range
233 //
234 // \tparam hits_iterator a hit_type iterator
235 //
236 template <typename fm_index_type>
237 template <typename hits_iterator>
239  const uint64 begin,
240  const uint64 end,
241  hits_iterator hits)
242 {
243  // fill the output hits with (SA,string-id) coordinates
245  thrust::make_counting_iterator<uint64>(0u) + begin,
246  thrust::make_counting_iterator<uint64>(0u) + end,
247  hits,
249  m_n_queries,
250  nvbio::plain_view( m_slots ),
251  nvbio::plain_view( m_ranges ) ) );
252 
253  // and locate the SA coordinates
255  hits,
256  hits + (end - begin),
257  hits,
259 }
260 
261 // enact the filter on an FM-index and a string-set
262 //
263 // \param fm_index the FM-index
264 // \param string-set the query string-set
265 //
266 // \return the total number of hits
267 //
268 template <typename fm_index_type>
269 template <typename string_set_type>
271  const fm_index_type& index,
272  const string_set_type& string_set)
273 {
274  // save the query
275  m_n_queries = string_set.size();
276  m_index = index;
277 
278  // alloc enough storage for the results
279  m_ranges.resize( m_n_queries );
280  m_slots.resize( m_n_queries );
281 
282  // search the strings in the index, obtaining a set of ranges
284  thrust::make_counting_iterator<uint32>(0u),
285  thrust::make_counting_iterator<uint32>(0u) + m_n_queries,
286  m_ranges.begin(),
288 
289  // scan their size to determine the slots
291  m_n_queries,
293  m_slots.begin(),
294  thrust::plus<uint64>(),
295  d_temp_storage );
296 
297  // determine the total number of occurrences
298  m_n_occurrences = m_slots[ m_n_queries-1 ];
299  return m_n_occurrences;
300 }
301 
302 // enumerate all hits in a given range
303 //
304 // \tparam hits_iterator a hit_type iterator
305 //
306 template <typename fm_index_type>
307 template <typename hits_iterator>
309  const uint64 begin,
310  const uint64 end,
311  hits_iterator hits)
312 {
313 #if 0
314  const uint32 n_hits = end - begin;
315  const uint32 buffer_size = align<32>( n_hits );
316 
317  if (m_hits.size() < buffer_size * 2u)
318  {
319  m_hits.clear();
320  m_hits.resize( buffer_size * 2u );
321  }
322 
323  // fill the output hits with (SA,string-id) coordinates
325  thrust::make_counting_iterator<uint64>(0u) + begin,
326  thrust::make_counting_iterator<uint64>(0u) + end,
327  m_hits.begin(),
329  m_n_queries,
330  nvbio::plain_view( m_slots ),
331  nvbio::plain_view( m_ranges ) ) );
332 
333  // sort by the first 8 bits of the SA coordinates
334  uint64* raw_hits( (uint64*)nvbio::plain_view( m_hits ) );
335 
336  cuda::SortBuffers<uint64*> sort_buffers;
337  sort_buffers.keys[0] = raw_hits;
338  sort_buffers.keys[1] = raw_hits + buffer_size;
339 
340  cuda::SortEnactor sort_enactor;
341  sort_enactor.sort( n_hits, sort_buffers, 0u, 8u );
342 
343  const uint32 pairs_selector = sort_buffers.selector;
344  const uint32 ssa_selector = sort_buffers.selector ? 0u : 1u;
345 
346  // locate the SSA iterators
348  m_hits.begin() + buffer_size * pairs_selector,
349  m_hits.begin() + buffer_size * pairs_selector + n_hits,
350  m_hits.begin() + buffer_size * ssa_selector,
352 
353  // perform the final SSA lookup
355  m_hits.begin() + buffer_size * pairs_selector,
356  m_hits.begin() + buffer_size * pairs_selector + n_hits,
357  m_hits.begin() + buffer_size * ssa_selector,
358  m_hits.begin() + buffer_size * pairs_selector,
360 
361  // and sort back by string-id into final position
362  sort_enactor.sort( n_hits, sort_buffers, 32u, 64u );
363 
364  thrust::copy(
365  m_hits.begin() + buffer_size * sort_buffers.selector,
366  m_hits.begin() + buffer_size * sort_buffers.selector + n_hits,
367  device_iterator( hits ) );
368 #else
369  const uint32 n_hits = end - begin;
370 
371  if (m_hits.size() < n_hits)
372  {
373  m_hits.clear();
374  m_hits.resize( n_hits );
375  }
376 
377  // fill the output hits with (SA,string-id) coordinates
379  thrust::make_counting_iterator<uint64>(0u) + begin,
380  thrust::make_counting_iterator<uint64>(0u) + end,
381  device_iterator( hits ),
383  m_n_queries,
384  nvbio::plain_view( m_slots ),
385  nvbio::plain_view( m_ranges ) ) );
386 
387  // locate the SSA iterators
389  device_iterator( hits ),
390  device_iterator( hits ) + n_hits,
391  m_hits.begin(),
393 
394  // and perform the final SSA lookup
396  device_iterator( hits ),
397  device_iterator( hits ) + n_hits,
398  m_hits.begin(),
399  device_iterator( hits ),
401 #endif
402 }
403 
404 } // namespace nvbio