NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
ssa_inl.h
Go to the documentation of this file.
1 /*
2  * nvbio
3  * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 namespace nvbio {
29 
30 // constructor
31 //
// Build a value-sampled suffix array: keep sa[i] only where sa[i] is a
// multiple of K, together with a presence bitmask (1 bit per entry) and a
// running counter stored once every 64 entries for fast ranking.
// NOTE(review): the constructor's signature line is missing from this
// rendering; 'sa' is read at indices 0..n, so it presumably holds n+1
// entries (the n suffixes plus the '$' sentinel) — TODO confirm.
// NOTE(review): the (K-1) masks assume K is a power of two — verify callers.
33  const uint32 n,
34  const int32* sa,
35  const uint32 K)
36 {
37  m_n = n;
38 
// one 32-bit bitmask word per 32 entries, one block counter per 64 entries,
// sized to cover all n+1 suffixes
39  const uint32 n_words = (n+1+31) >> 5;
40  const uint32 n_blocks = (n+1+63) >> 6;
41 
42  m_bitmask.resize( n_words );
43  for (uint32 i = 0; i < n_words; ++i)
44  m_bitmask[i] = 0u;
45 
46  m_blocks.resize( n_blocks );
47 
48  // count how many items we need to store
49  m_stored = 0;
50  for (uint32 i = 0; i <= n; ++i)
51  {
52  // store a block counter every 64 elements
53  if ((i & 63) == 0)
54  m_blocks[ i >> 6 ] = m_stored;
55 
56  // check whether this SA value is a multiple of K
57  if ((sa[i] & (K-1)) == 0)
58  {
59  m_stored++;
60  m_bitmask[ i >> 5 ] |= (1u << (i&31u));
61  }
62  }
63 
// second pass: now that the exact count is known, compact the sampled
// values into m_ssa in suffix-array order
64  m_ssa.resize( m_stored );
65 
66  // store all the needed values
67  m_stored = 0;
68  for (uint32 i = 0; i <= n; ++i)
69  {
70  if ((sa[i] & (K-1)) == 0)
71  m_ssa[ m_stored++ ] = sa[i];
72  }
73 }
74 
75 // constructor
76 //
77 // \param fmi FM index
78 // \param K compression factor
79 template <typename FMIndexType>
// Build the value-sampled suffix array directly from an FM index, without a
// materialized suffix array: walk the inverse-Psi permutation starting at
// isa = 0 (which corresponds to sa = n), decrementing the SA value each step.
// NOTE(review): the constructor's signature line is missing from this
// rendering; the parameters below are the FM index and the sampling factor K
// (a power of two, given the (K-1) masks).
81  const FMIndexType& fmi,
82  const uint32 K)
83 {
84  const uint32 n = fmi.length();
85 
86  m_n = n;
87 
88  const uint32 n_words = (n+1+31) >> 5;
89  const uint32 n_blocks = (n+1+63) >> 6;
90 
91  m_bitmask.resize( n_words );
92  for (uint32 i = 0; i < n_words; ++i)
93  m_bitmask[i] = 0u;
94 
95  m_blocks.resize( n_blocks );
96 
97  // count how many items we need to store
98  m_stored = 0;
99  {
100  // calculate SA value
// first pass: mark in the bitmask every isa position whose SA value is a
// multiple of K, and count them
101  uint32 isa = 0, sa = n;
102 
103  for (uint32 i = 0; i < n; ++i)
104  {
105  if ((sa & (K-1)) == 0)
106  {
107  m_stored++;
108  m_bitmask[ isa >> 5 ] |= (1u << (isa & 31u));
109  }
110 
111  --sa;
112 
113  isa = basic_inv_psi( fmi, isa );
114  }
// handle the last position of the walk (sa == 0)
115  if ((sa & (K-1)) == 0)
116  {
117  m_stored++;
118  m_bitmask[ isa >> 5 ] |= (1u << (isa & 31u));
119  }
120  }
121 
122  // compute the block counters, 1 every 64 elements
// each 64-entry block spans exactly two 32-bit bitmask words, hence the
// i*2-1 / i*2-2 indices below
123  m_blocks[0] = 0;
124  for (uint32 i = 1; i < n_blocks; ++i)
125  {
126  m_blocks[i] = m_blocks[i-1] +
127  popc( m_bitmask[i*2-1] ) +
128  popc( m_bitmask[i*2-2] );
129  }
130 
131  m_ssa.resize( m_stored );
132 
133  // store all the needed values
// second pass: repeat the same walk, this time writing each sampled SA
// value at the rank computed by index()
134  {
135  // calculate SA value
136  uint32 isa = 0, sa = n;
137 
138  for (uint32 i = 0; i < n; ++i)
139  {
140  if ((sa & (K-1)) == 0)
141  m_ssa[ index( isa ) ] = sa;
142 
143  --sa;
144 
145  isa = basic_inv_psi( fmi, isa );
146  }
147  if ((sa & (K-1)) == 0)
148  m_ssa[ index( isa ) ] = sa;
149  }
150 
151  // NOTE: do we need to handle the fact we don't have sa[0] = -1?
// the walk produced sa[0] = n for the '$' sentinel; the int32-based
// constructor stores -1 there instead, so normalize for consistency
152  if (m_ssa[0] == n)
153  m_ssa[0] = uint32(-1);
154 }
155 
// compute the rank of entry 'i' among the sampled (bitmask-marked) entries,
// i.e. its position within the compacted m_ssa storage
//
// \param i   suffix-array index (must be a marked entry for the result to
//            address a stored value)
// \return    offset into m_ssa
inline uint32 SSA_value_multiple::index(const uint32 i) const
{
    const uint32 word  = i >> 5;
    const uint32 block = i >> 6;

    // start from the counter stored once every 64 entries
    uint32 pos = m_blocks[ block ];

    // add the population of the whole words between the block start and
    // the word containing 'i' (each block spans two 32-bit words)
    for (uint32 j = block*2; j < word; ++j)
        pos += popc( m_bitmask[j] );

    // finally, add the marked entries preceding 'i' inside its own word
    pos += popc( m_bitmask[ word ] & ((1u << (i & 31u)) - 1u) );

    return pos;
}
180 
181 // constructor
182 //
// Device-side copy constructor: allocate raw device buffers and upload the
// host-side sampled suffix array, bitmask and block counters.
// NOTE(review): the constructor's signature line is missing from this
// rendering; it presumably takes a const SSA_value_multiple& — confirm.
// NOTE(review): the host-side constructors size the bitmask as (n+1+31)>>5
// words and the blocks as (n+1+63)>>6, while the formulas below drop the +1;
// when m_n is a multiple of 32 (resp. 64) this copies one fewer word than
// the host stores — verify this off-by-one against the host layout.
184  m_n( ssa.m_n ),
185  m_stored( ssa.m_stored )
186 {
187  if (m_n != 0)
188  {
189  const uint32 n_words = (m_n+31) >> 5;
190  const uint32 n_blocks = (m_n+63) >> 6;
191 
192  cudaMalloc( &m_ssa, sizeof(uint32)*ssa.m_stored );
193  cudaMalloc( &m_bitmask, sizeof(uint32)*n_words );
194  cudaMalloc( &m_blocks, sizeof(uint32)*n_blocks );
195 
196  cudaMemcpy( m_ssa, &ssa.m_ssa[0], sizeof(uint32)*m_stored, cudaMemcpyHostToDevice );
197  cudaMemcpy( m_bitmask, &ssa.m_bitmask[0], sizeof(uint32)*n_words, cudaMemcpyHostToDevice );
198  cudaMemcpy( m_blocks, &ssa.m_blocks[0], sizeof(uint32)*n_blocks, cudaMemcpyHostToDevice );
199  }
200  else
201  {
// empty source: leave all device pointers NULL (cudaFree(NULL) is a no-op
// in the destructor)
202  m_ssa = NULL;
203  m_bitmask = NULL;
204  m_blocks = NULL;
205  }
206 }
207 
208 // destructor
209 //
// Release the device buffers allocated by the constructor.
// NOTE(review): the destructor's signature line is missing from this
// rendering. cudaFree(NULL) is a no-op, so the empty (m_n == 0) case is safe.
211 {
212  cudaFree( m_ssa );
213  cudaFree( m_bitmask );
214  cudaFree( m_blocks );
215 }
216 
217 // fetch the i-th value, if stored, return false otherwise.
218 //
219 template <typename SSAIterator, typename BitmaskIterator, typename BlockIterator>
// Fetch the i-th SA value if it is stored, returning false otherwise.
// NOTE(review): the signature line is missing from this rendering; it takes
// the index 'i' and an output reference 'r' — confirm against the header.
221 {
222  // check whether the value is present
223  const uint32 word = i >> 5;
224  const uint32 word_mask = m_bitmask[ word ];
225  const uint32 is_there = word_mask & (1u << (i&31u));
226  if (is_there == 0)
227  return false;
228 
229  //
230  // compute the location of the value in the sampled suffix array
231  //
232 
233  // get the base block value
234  const uint32 block = i >> 6;
235  uint32 pos = m_blocks[ block ];
236 
237  // add the number of bits set in the bitmask containing 'i'
238  const uint32 shifted_mask = word_mask & ((1u << (i&31u)) - 1u);
239  pos += popc( shifted_mask );
240 
241  // add the number of bits set in all the preceding bitmasks
// each 64-entry block spans two bitmask words, hence the block*2 start
242  for (uint32 j = block*2; j < word; ++j)
243  pos += popc( m_bitmask[j] );
244 
245  r = m_ssa[ pos ];
246  return true;
247 }
248 
249 // fetch the i-th value, if stored, return false otherwise.
250 //
251 template <typename SSAIterator, typename BitmaskIterator, typename BlockIterator>
// Check whether the i-th SA value is stored, by testing its bitmask bit.
// NOTE(review): the signature line is missing from this rendering; the
// non-zero uint32 is implicitly converted to the boolean result.
253 {
254  // check whether the value is present
255  const uint32 word = i >> 5;
256  const uint32 word_mask = m_bitmask[ word ];
257  const uint32 is_there = word_mask & (1u << (i&31u));
258  return is_there;
259 }
260 
261 // constructor
262 //
263 template <uint32 K, typename index_type>
// Build an index-sampled suffix array: store sa[i] for every i that is a
// multiple of K. Unlike the value-sampled variant, no bitmask is needed —
// presence is decided by the index alone.
// NOTE(review): the constructor's signature line is missing from this
// rendering; 'sa' is read at indices 0, K, 2K, ... up to (n_items-1)*K, so
// it presumably holds n+1 entries — confirm.
265  const index_type n,
266  const index_type* sa)
267 {
// ceil((n+1)/K) sampled entries cover all n+1 suffixes
268  const uint32 n_items = (n+1+K-1) / K;
269 
270  m_n = n;
271  m_ssa.resize( n_items );
272 
273  // store all the needed values
274  for (uint32 i = 0; i < n_items; ++i)
275  m_ssa[i] = sa[i*K];
276 }
277 
278 // constructor
279 //
280 // \param fmi FM index
281 // \param K compression factor
282 template <uint32 K, typename index_type>
283 template <typename FMIndexType>
// Build the index-sampled suffix array from an FM index, walking the
// inverse-Psi permutation from isa = 0 (sa = n) and storing the SA value at
// every isa that is a multiple of K.
// NOTE(review): the constructor's signature line is missing from this
// rendering; K is the class template parameter (a power of two, given the
// (K-1) masks).
285  const FMIndexType& fmi)
286 {
287  const uint32 n = fmi.length();
288  const uint32 n_items = (n+1+K-1) / K;
289 
290  m_n = n;
291  m_ssa.resize( n_items );
292 
293  // calculate SA value
294  index_type isa = 0, sa = n;
295 
296  for (index_type i = 0; i < n; ++i)
297  {
// sampling is by isa index here (not by SA value as in SSA_value_multiple)
298  if ((isa & (K-1)) == 0)
299  m_ssa[ isa/K ] = sa;
300 
301  --sa;
302 
303  isa = basic_inv_psi( fmi, isa );
304  }
// handle the final position of the walk (sa == 0)
305  if ((isa & (K-1)) == 0)
306  m_ssa[ isa/K ] = sa;
307 
308  m_ssa[0] = index_type(-1); // before this line, ssa[0] = n
309 }
310 
311 // constructor
312 //
313 // \param ssa device-side ssa
314 template <uint32 K, typename index_type>
// Host-side copy constructor from a device-side sampled suffix array:
// download the device vector into host storage.
// NOTE(review): the constructor's signature lines are missing from this
// rendering; it presumably takes a const SSA_index_multiple_device<K,index_type>& —
// confirm against the header.
317 {
318  m_n = ssa.m_n;
319  m_ssa.resize( ssa.m_ssa.size() );
320 
321  const uint32 n_items = (m_n+1+K-1) / K;
322 
// blocking copy: no explicit synchronization is needed afterwards
323  cudaMemcpy( &m_ssa[0], thrust::raw_pointer_cast(&ssa.m_ssa[0]), sizeof(index_type)*n_items, cudaMemcpyDeviceToHost );
324 }
325 
326 // copy operator
327 //
328 // \param ssa device-side ssa
329 template <uint32 K, typename index_type>
// Copy assignment from a device-side sampled suffix array: download the
// device vector into host storage (same logic as the copy constructor).
// NOTE(review): the operator's signature lines are missing from this
// rendering; it presumably takes a const SSA_index_multiple_device<K,index_type>&
// and returns *this by reference — confirm against the header.
332 {
333  m_n = ssa.m_n;
334  m_ssa.resize( ssa.m_ssa.size() );
335 
336  const uint32 n_items = (m_n+1+K-1) / K;
337 
338  cudaMemcpy( &m_ssa[0], thrust::raw_pointer_cast(&ssa.m_ssa[0]), sizeof(index_type)*n_items, cudaMemcpyDeviceToHost );
339  return *this;
340 }
341 
342 // constructor
343 //
344 template <uint32 K, typename index_type>
// Device-side copy constructor from a host-side sampled suffix array:
// upload the host data into the device vector.
// NOTE(review): the constructor's signature line is missing from this
// rendering; it presumably takes a const SSA_index_multiple<K,index_type>& —
// confirm against the header.
346  m_n( ssa.m_n )
347 {
348  const uint32 n_items = (m_n+1+K-1) / K;
349 
350  m_ssa.resize( n_items );
351 
352  cudaMemcpy( thrust::raw_pointer_cast(&m_ssa[0]), &ssa.m_ssa[0], sizeof(index_type)*n_items, cudaMemcpyHostToDevice );
353 }
354 
355 #ifdef __CUDACC__
356 
// For each sampled isa in [range_begin, range_end), walk the inverse-Psi
// chain until the next multiple of K is reached, recording the number of
// steps taken in 'ssa' and the chain successor in 'link'. The host later
// ranks this link structure sequentially to recover the actual SA values.
//
// Launch shape: 1D grid, 1D blocks; a grid-stride loop makes the kernel
// correct for any grid size.
template <uint32 K, typename index_type, typename FMIndexType>
__global__ void SSA_index_multiple_setup_kernel(
    const uint32        range_begin,
    const uint32        range_end,
    const FMIndexType   fmi,
    index_type*         ssa,
    index_type*         link)
{
    const uint32 stride = blockDim.x * gridDim.x;

    // grid-stride loop over the assigned sample range
    for (uint32 idx = range_begin + threadIdx.x + blockIdx.x * blockDim.x;
         idx < range_end;
         idx += stride)
    {
        //
        // count the inverse-Psi steps from this sampled isa to the next
        // one, and record the link between the two
        //
        index_type isa   = idx * K;
        uint32     steps = 0;

        do
        {
            ++steps;

            isa = basic_inv_psi( fmi, isa );
            NVBIO_CUDA_ASSERT( isa <= fmi.length() );
        }
        while ((isa & (K-1)) != 0);

        ssa[ isa/K ] = steps;
        link[ idx ]  = isa/K;
    }
}
390 
394 template <uint32 K, typename index_type>
395 template <typename FMIndexType>
// Construct the device-side sampled suffix array from an FM index by
// delegating to init().
// NOTE(review): the constructor's signature line is missing from this
// rendering.
397  const FMIndexType& fmi)
398 {
399  init( fmi );
400 }
401 
// initialize the sampled suffix array from an FM index
//
// A kernel computes, for every sampled isa, the number of inverse-Psi steps
// to the next sampled isa together with a link structure between them; the
// host then walks the links (a sequential list-ranking / prefix-sum) to turn
// the step counts into actual SA values, and uploads the result.
//
// \param fmi   FM index
//
// \throws std::runtime_error if the link walk escapes the valid range
// (which would indicate a corrupted FM index)
template <uint32 K, typename index_type>
template <typename FMIndexType>
void SSA_index_multiple_device<K,index_type>::init(
    const FMIndexType& fmi)
{
    m_n = fmi.length();
    const uint32 n_items = (m_n+1+K-1) / K;   // ceil((m_n+1)/K) sampled entries

    m_ssa.resize( n_items );

    index_type* d_link;
    cudaMalloc( &d_link, sizeof(index_type)*n_items );

    //
    // Compute m_ssa and link: this kernel computes the number of steps
    // taken to go from one isa to the next, and the link structure between
    // them.
    //

    const uint32 BLOCK_SIZE = 128;
    const uint32 max_blocks = (uint32)nvbio::cuda::max_active_blocks(
        SSA_index_multiple_setup_kernel<K,index_type,FMIndexType>, BLOCK_SIZE, 0);

    // process the sample range in batches to avoid overly long-running kernels
    const uint32 BATCH_SIZE = 128*1024;
    for (uint32 range_begin = 0; range_begin < n_items; range_begin += BATCH_SIZE)
    {
        // n_items == m_n/K + 1, the value the original code spelled out inline
        const uint32 range_end = std::min( range_begin + BATCH_SIZE, n_items );
        const uint32 n_blocks  = std::min( max_blocks, (range_end - range_begin + BLOCK_SIZE-1) / BLOCK_SIZE );

        SSA_index_multiple_setup_kernel<K> <<<n_blocks,BLOCK_SIZE>>>(
            range_begin,
            range_end,
            fmi,
            thrust::raw_pointer_cast(&m_ssa[0]),
            d_link );

        // cudaThreadSynchronize() is deprecated: use cudaDeviceSynchronize()
        cudaDeviceSynchronize();
    }

    // copy ssa & link to the host
    std::vector<index_type> h_link( n_items );
    std::vector<index_type> h_ssa( n_items );

    cudaMemcpy( &h_ssa[0],  thrust::raw_pointer_cast(&m_ssa[0]), sizeof(index_type)*n_items, cudaMemcpyDeviceToHost );
    cudaMemcpy( &h_link[0], d_link,                               sizeof(index_type)*n_items, cudaMemcpyDeviceToHost );

    // d_link is no longer needed: free it *before* the walk below, which may
    // throw — the original code leaked this allocation on the throw path
    cudaFree( d_link );

    //
    // Walk the link structure between the isa's, and do what is essentially
    // a prefix-sum of the associated number of steps on the way to compute
    // the corresponding sa indices.
    // As the order with which the counters are summed is specified by the
    // link structure, it would be hard to parallelize this (might be possible
    // with parallel list-ranking, but it's likely overkill).
    //
    typedef typename signed_type<index_type>::type sindex_type;

    index_type  isa_div_k = 0;
    sindex_type sa        = m_n;
    while (sa > 0)
    {
        // guard against a corrupted link chain
        if (isa_div_k >= n_items)
            throw std::runtime_error("SSA_index_multiple_device: index out of bounds\n");

        isa_div_k = h_link[ isa_div_k ];
        sa       -= h_ssa[ isa_div_k ];

        h_ssa[ isa_div_k ] = index_type( sa );
    }
    // isa = 0 corresponds to the '$' terminator: by convention sa[0] = -1
    h_ssa[0] = index_type(-1);

    // copy ssa back to the device
    cudaMemcpy( thrust::raw_pointer_cast(&m_ssa[0]), &h_ssa[0], sizeof(index_type)*n_items, cudaMemcpyHostToDevice );
}
481 
482 #endif
483 
484 // fetch the i-th value, if stored, return false otherwise.
485 //
486 template <uint32 K, typename Iterator>
// Fetch the i-th SA value if it is stored (i.e. if i is a multiple of K),
// returning false otherwise.
// NOTE(review): the signature line is missing from this rendering; it takes
// the index 'i' and an output reference 'r' — confirm against the header.
488 {
489  if ((i & (K-1)) == 0)
490  {
491  r = m_ssa[ i / K ];
492  return true;
493  }
494  else
495  return false;
496 }
497 
498 // fetch the i-th value, if stored, return false otherwise.
499 //
500 template <uint32 K, typename Iterator>
// Check whether the i-th SA value is stored: true exactly when i is a
// multiple of K (K is a power of two, so the mask test suffices).
// NOTE(review): the signature line is missing from this rendering.
502 {
503  return ((i & (K-1)) == 0);
504 }
505 
506 } // namespace nvbio