NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
sufsort_bucketing.h
Go to the documentation of this file.
1 /*
2  * nvbio
3  * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #pragma once
29 
30 #include <cub/cub.cuh>
31 #include <mgpuhost.cuh>
32 #include <moderngpu.cuh>
34 #include <nvbio/basic/timer.h>
35 #include <nvbio/basic/cuda/timer.h>
36 #include <nvbio/basic/cuda/sort.h>
38 #include <thrust/host_vector.h>
39 #include <thrust/device_vector.h>
40 #include <thrust/adjacent_difference.h>
41 #include <nvbio/basic/omp.h>
42 
43 namespace nvbio {
44 namespace priv {
45 
53 template <uint32 SYMBOL_SIZE, uint32 N_BITS, uint32 DOLLAR_BITS, typename bucket_type = uint32>
55 {
59  suffixes( _mgpu ),
60  d_setup_time(0.0f),
61  d_flatten_time(0.0f),
62  d_count_sort_time(0.0f),
63  d_collect_sort_time(0.0f),
64  d_filter_time(0.0f),
65  d_remap_time(0.0f),
66  d_max_time(0.0f),
67  d_search_time(0.0f),
68  d_copy_time(0.0f),
69  m_mgpu( _mgpu ) {}
70 
73  void clear_timers()
74  {
75  suffixes.clear_timers();
76 
77  d_setup_time = 0.0f;
78  d_flatten_time = 0.0f;
79  d_count_sort_time = 0.0f;
80  d_collect_sort_time = 0.0f;
81  d_filter_time = 0.0f;
82  d_remap_time = 0.0f;
83  d_max_time = 0.0f;
84  d_copy_time = 0.0f;
85  }
86 
// Count the suffixes of string_set, bucketed by their first radix word.
//
// Pipeline (each stage timed into the corresponding d_*_time counter):
//   1. flatten the set's suffixes, skipping empty ones, and extract the
//      first N_BITS-wide radix word of each suffix into d_radices;
//   2. radix-sort the extracted words (d_radices acts as a double buffer:
//      keys[0] is the front half, keys[1] the back half);
//   3. zero d_buckets, compute the cumulative histogram of the sorted
//      radices with an upper-bound SortedSearch, then difference it into
//      per-bucket counts with thrust::adjacent_difference.
//
// Returns the total number of non-empty suffixes in the set.
90  template <typename string_set_type>
91  uint64 count(const string_set_type& string_set)
92  {
93  cuda::Timer timer;
94 
95  // initialize the flattener
96  suffixes.set( string_set, false ); // skip empty suffixes, whose position is trivial
97 
98  const uint32 n_suffixes = suffixes.n_suffixes;
99  const uint32 n_buckets = 1u << N_BITS;
100 
101  // initialize the temporary and output vectors
102  alloc_storage( d_indices, n_suffixes * 2u );
103  alloc_storage( d_radices, n_suffixes * 2u );
104  alloc_storage( d_buckets, n_buckets );
105 
106  timer.start();
107 
108  // extract the first radix word from each of the suffixes
109  suffixes.flatten(
110  string_set,
111  0u, // load the first word
112  Bits<N_BITS, DOLLAR_BITS>(), // number of bits per word
113  thrust::make_counting_iterator<uint32>(0u),
114  d_radices.begin() );
115 
116  timer.stop();
117  d_flatten_time += timer.seconds();
118 
119  timer.start();
120 
121  // sort the radices so as to make binning easy
122  cuda::SortBuffers<bucket_type*> sort_buffers;
123  cuda::SortEnactor sort_enactor;
124 
// the two halves of d_radices serve as the ping-pong buffers of the sort
125  sort_buffers.selector = 0;
126  sort_buffers.keys[0] = nvbio::device_view( d_radices );
127  sort_buffers.keys[1] = nvbio::device_view( d_radices ) + n_suffixes;
128  sort_enactor.sort( n_suffixes, sort_buffers, 0u, N_BITS );
129 
130  timer.stop();
131  d_count_sort_time += timer.seconds();
132 
133  // initialize the bucket counters
134  thrust::fill( d_buckets.begin(), d_buckets.end(), bucket_type(0u) );
135 
136  // compute the number of effectively used buckets looking at the last non-empty one
// NOTE: indexing a thrust::device_vector on the host performs a single-element
// device-to-host copy; acceptable here since it happens once per call
137  const uint32 n_used_buckets = d_radices[ sort_buffers.selector * n_suffixes + n_suffixes-1 ] + 1u;
138 
139  timer.start();
140 
141  // find the end of each bin of values
// upper-bound search of every bucket id in the sorted radices gives the
// cumulative (inclusive) histogram in d_buckets
142  mgpu::SortedSearch<mgpu::MgpuBoundsUpper>(
143  thrust::make_counting_iterator<uint32>(0u),
144  n_used_buckets,
145  d_radices.begin() + sort_buffers.selector * n_suffixes,
146  n_suffixes,
147  d_buckets.begin(),
148  *m_mgpu );
149 
150  // compute the histogram by taking differences of the cumulative histogram
151  thrust::adjacent_difference(
152  d_buckets.begin(), d_buckets.begin() + n_used_buckets,
153  d_buckets.begin());
154 
155  timer.stop();
156  d_search_time += timer.seconds();
157 
158  return suffixes.n_suffixes;
159  }
160 
164  template <typename string_set_type, typename bucketmap_iterator>
166  const string_set_type& string_set,
167  const uint32 bucket_begin,
168  const uint32 bucket_end,
169  const uint32 string_offset,
170  const bucketmap_iterator bucketmap)
171  {
172  cuda::Timer timer;
173 
174  timer.start();
175 
176  // initialize the flattener
177  suffixes.set( string_set, false ); // skip empty suffixes, whose position is trivial
178 
179  timer.stop();
180  d_setup_time += timer.seconds();
181 
182  const uint32 n_suffixes = suffixes.n_suffixes;
183  const uint32 n_buckets = 1u << N_BITS;
184 
185  // initialize the temporary and output vectors
186  alloc_storage( d_indices, n_suffixes * 2u );
187  alloc_storage( d_radices, n_suffixes * 2u );
188  alloc_storage( d_buckets, n_buckets );
189 
190  timer.start();
191 
192  // extract the first radix word from each of the suffixes
193  suffixes.flatten(
194  string_set,
195  0u, // load the first word
196  Bits<N_BITS, DOLLAR_BITS>(), // number of bits per word
197  thrust::make_counting_iterator<uint32>(0u),
198  d_radices.begin() );
199 
200  timer.stop();
201  d_flatten_time += timer.seconds();
202 
203  timer.start();
204 
205  // determine if a radix is in the given bucket range
206  const priv::in_range_functor in_range = priv::in_range_functor( bucket_begin, bucket_end );
207 
208  // retain only suffixes whose radix is between the specified buckets
210  n_suffixes,
211  thrust::make_zip_iterator( thrust::make_tuple( thrust::make_counting_iterator<uint32>(0u), d_radices.begin() ) ),
212  thrust::make_transform_iterator( d_radices.begin(), in_range ),
213  thrust::make_zip_iterator( thrust::make_tuple( d_indices.begin(), d_radices.begin() ) ) + n_suffixes,
214  suffixes.temp_storage );
215 
216  timer.stop();
217  d_filter_time += timer.seconds();
218 
219  timer.start();
220 
221  // remap the collected radices
222  thrust::gather(
223  d_radices.begin() + n_suffixes,
224  d_radices.begin() + n_suffixes + n_collected,
225  bucketmap,
226  d_radices.begin() + n_suffixes );
227 
228  timer.stop();
229  d_remap_time += timer.seconds();
230 
231  timer.start();
232 
233  // compute the maximum suffix length inside the range
234  max_suffix_len = suffixes.max_length(
235  string_set,
236  d_indices.begin() + n_suffixes,
237  d_indices.begin() + n_suffixes + n_collected );
238 
239  timer.stop();
240  d_max_time += timer.seconds();
241 
242  timer.start();
243 
244  // use a device-staging buffer to localize the indices
245  const uint32 buffer_stride = util::round_i( n_collected, 32u );
246  alloc_storage( d_output, buffer_stride * 2 );
247 
248  localize_suffix_functor localizer(
249  nvbio::device_view( suffixes.cum_lengths ),
250  nvbio::device_view( suffixes.string_ids ),
251  string_offset );
252 
254  d_indices.begin() + n_suffixes,
255  d_indices.begin() + n_suffixes + n_collected,
256  d_output.begin() + buffer_stride,
257  localizer );
258 
259  timer.stop();
260  d_copy_time += timer.seconds();
261 
262  timer.start();
263 
264  // sort the radices so as to make binning easy
266  cuda::SortEnactor sort_enactor;
267 
268  sort_buffers.selector = 0;
269  //#define SORT_BY_BUCKETS
270  #if defined(SORT_BY_BUCKETS)
271  sort_buffers.keys[0] = nvbio::device_view( d_radices ) + n_suffixes;
272  sort_buffers.keys[1] = nvbio::device_view( d_radices );
273  sort_buffers.values[0] = (uint64*)nvbio::device_view( d_output ) + buffer_stride;
274  sort_buffers.values[1] = (uint64*)nvbio::device_view( d_output );
275  sort_enactor.sort( n_collected, sort_buffers, 0u, N_BITS );
276  #endif
277 
278  timer.stop();
279  d_collect_sort_time += timer.seconds();
280 
281  //
282  // copy all the indices inside the range to the output
283  //
284 
285  alloc_storage( h_suffixes, n_suffixes );
286  alloc_storage( h_radices, n_suffixes );
287 
288  timer.start();
289 
290  // the buffer selector had inverted semantics
291  sort_buffers.selector = 1 - sort_buffers.selector;
292 
293  // and copy everything to the output
294  thrust::copy(
295  d_output.begin() + sort_buffers.selector * buffer_stride,
296  d_output.begin() + sort_buffers.selector * buffer_stride + n_collected,
297  h_suffixes.begin() );
298 
299  // and copy everything to the output
300  thrust::copy(
301  d_radices.begin() + sort_buffers.selector * n_suffixes,
302  d_radices.begin() + sort_buffers.selector * n_suffixes + n_collected,
303  h_radices.begin() );
304 
305  timer.stop();
306  d_copy_time += timer.seconds();
307 
308  return n_collected;
309  }
310 
314  {
315  return
316  suffixes.allocated_device_memory() +
317  d_indices.size() * sizeof(uint32) +
318  d_radices.size() * sizeof(bucket_type) +
319  d_buckets.size() * sizeof(uint32) +
320  d_output.size() * sizeof(uint32);
321  }
322 
// Return the amount of host memory currently allocated by this bucketer,
// in bytes (the staging copies of the radices and localized suffixes).
// NOTE(review): the declaration line was elided by the documentation
// generator — presumably `uint64 allocated_host_memory() const`, matching
// the sibling accessor; confirm against the original source.
326  {
327  return
328  //suffixes.allocated_host_memory() +
329  h_radices.size() * sizeof(bucket_type) +
330  h_suffixes.size() * sizeof(uint2);
331  }
332 
333 public:
335  thrust::device_vector<uint32> d_indices;
336  thrust::device_vector<bucket_type> d_radices;
337  thrust::device_vector<uint32> d_buckets;
338  thrust::device_vector<uint2> d_output;
339  thrust::host_vector<bucket_type> h_radices;
340  thrust::host_vector<uint2> h_suffixes;
343 
350  float d_max_time;
352  float d_copy_time;
354 };
355 
366 template <
367  uint32 SYMBOL_SIZE,
368  bool BIG_ENDIAN,
369  typename storage_type,
370  uint32 N_BITS,
371  uint32 DOLLAR_BITS,
372  typename bucket_type,
373  typename system_tag>
375 {
377  typedef typename chunk_loader_type::chunk_set_type chunk_set_type;
378 
382 
386  {
387  count_time = 0.0f;
388  load_time = 0.0f;
389  merge_time = 0.0f;
390 
391  collect_time = 0.0f;
392  bin_time = 0.0f;
393  }
394 
398  template <typename string_set_type>
400  const string_set_type& string_set,
401  thrust::host_vector<uint32>& h_buckets)
402  {
403  Timer count_timer;
404  count_timer.start();
405 
406  const uint32 chunk_size = 128*1024;
407  const uint32 n_strings = string_set.size();
408 
409  uint64 n_suffixes = 0u;
410 
411  const uint32 n_buckets = 1u << N_BITS;
412 
413  // initialize temporary vectors
414  alloc_storage( d_buckets, n_buckets );
415 
416  // initialize the bucket counters
417  thrust::fill( d_buckets.begin(), d_buckets.end(), bucket_type(0u) );
418 
419  // declare a chunk loader
420  chunk_loader_type chunk_loader;
421 
422  // split the string-set in batches
423  for (uint32 chunk_begin = 0; chunk_begin < n_strings; chunk_begin += chunk_size)
424  {
425  // determine the end of this batch
426  const uint32 chunk_end = nvbio::min( chunk_begin + chunk_size, n_strings );
427 
428  Timer timer;
429  timer.start();
430 
431  const chunk_set_type chunk_set = chunk_loader.load( string_set, chunk_begin, chunk_end );
432 
433  NVBIO_CUDA_DEBUG_STATEMENT( cudaDeviceSynchronize() );
434  timer.stop();
435  load_time += timer.seconds();
436 
437  n_suffixes += m_bucketer.count( chunk_set );
438 
439  timer.start();
440 
441  // and merge them in with the global buckets
443  m_bucketer.d_buckets.begin(),
444  m_bucketer.d_buckets.end(),
445  d_buckets.begin(),
446  d_buckets.begin(),
447  thrust::plus<uint32>() );
448 
449  NVBIO_CUDA_DEBUG_STATEMENT( cudaDeviceSynchronize() );
450  timer.stop();
451  merge_time += timer.seconds();
452  }
453 
454  // copy the results
455  thrust::copy( d_buckets.begin(), d_buckets.end(), h_buckets.begin() );
456 
457  //
458  // initialize internal bucketing helpers
459  //
460 
461  m_global_offset = 0u;
462 
463  // scan the bucket offsets so as to have global positions
464  alloc_storage( h_bucket_offsets, n_buckets );
468  h_bucket_offsets.begin() );
469 
470  count_timer.stop();
471  count_time += count_timer.seconds();
472 
473  return n_suffixes;
474  }
475 
479  template <typename string_set_type>
481  const string_set_type& string_set,
482  const uint32 bucket_begin,
483  const uint32 bucket_end,
484  uint32& max_suffix_len,
485  const thrust::host_vector<uint32>& h_subbuckets,
486  thrust::host_vector<uint2>& h_suffixes)
487  {
488  const uint32 chunk_size = 128*1024;
489  const uint32 n_strings = string_set.size();
490 
491  // copy the sub-buckets on the device
492  alloc_storage( d_subbuckets, uint32( h_subbuckets.size() ) );
493  thrust::copy( h_subbuckets.begin(), h_subbuckets.end(), d_subbuckets.begin() );
494 
495  // declare a chunk loader
496  chunk_loader_type chunk_loader;
497 
498  uint64 n_collected = 0u;
499 
500  // split the string-set in batches
501  for (uint32 chunk_begin = 0; chunk_begin < n_strings; chunk_begin += chunk_size)
502  {
503  // determine the end of this batch
504  const uint32 chunk_end = nvbio::min( chunk_begin + chunk_size, n_strings );
505 
506  Timer timer;
507  timer.start();
508 
509  const chunk_set_type chunk_set = chunk_loader.load( string_set, chunk_begin, chunk_end );
510 
511  NVBIO_CUDA_DEBUG_STATEMENT( cudaDeviceSynchronize() );
512  timer.stop();
513  load_time += timer.seconds();
514 
515  timer.start();
516 
517  m_bucketer.collect(
518  chunk_set,
519  bucket_begin,
520  bucket_end,
521  chunk_begin,
522  d_subbuckets.begin() );
523 
524  timer.stop();
525  collect_time += timer.seconds();
526 
527  // merge the collected suffixes
528  {
529  max_suffix_len = nvbio::max( max_suffix_len, m_bucketer.max_suffix_len );
530 
531  // check whether we are overflowing our output buffer
532  if (n_collected + m_bucketer.n_collected > h_suffixes.size())
533  {
534  log_error(stderr,"buffer size exceeded! (%llu/%llu)\n", n_collected + m_bucketer.n_collected, uint64( h_suffixes.size() ));
535  exit(1);
536  }
537 
538  Timer timer;
539  timer.start();
540 
541  // dispatch each suffix to their respective bucket
542  for (uint32 i = 0; i < m_bucketer.n_collected; ++i)
543  {
544  const uint2 loc = m_bucketer.h_suffixes[i];
545  const uint32 bucket = m_bucketer.h_radices[i];
546  const uint64 slot = h_bucket_offsets[bucket]++; // this could be done in parallel using atomics
547 
549  slot >= m_global_offset,
550  slot < m_global_offset + h_suffixes.size(),
551  "[%u] = (%u,%u) placed at %llu - %llu (%u)\n", i, loc.x, loc.y, slot, m_global_offset, bucket );
552 
553  h_suffixes[ slot - m_global_offset ] = loc;
554  }
555 
556  timer.stop();
557  bin_time += timer.seconds();
558  }
559 
560  n_collected += m_bucketer.n_collected;
561  }
562 
563  // keep track of how many suffixes we have collected globally
564  m_global_offset += n_collected;
565 
566  return n_collected;
567  }
568 
// Return the device memory in use, in bytes, delegating to the internal core bucketer.
571  uint64 allocated_device_memory() const { return m_bucketer.allocated_device_memory(); }
572 
// Return the host memory in use, in bytes, delegating to the internal core bucketer.
575  uint64 allocated_host_memory() const { return m_bucketer.allocated_host_memory(); }
576 
// Print (via log_verbose) the timing breakdown of the counting phase:
// the batched load/merge loop timers of this class plus the flatten,
// sort and search timers accumulated by the internal core bucketer.
// NOTE(review): the declaration line was elided by the documentation
// generator — presumably a void, parameter-less stats logger; confirm
// against the original source.
580  {
581  log_verbose(stderr," counting : %.1fs\n", count_time );
582  log_verbose(stderr," load : %.1fs\n", load_time);
583  log_verbose(stderr," merge : %.1fs\n", merge_time);
// pure counting time = total minus the load and merge components
584  log_verbose(stderr," count : %.1fs\n", count_time - load_time - merge_time);
585  log_verbose(stderr," flatten : %.1fs\n", m_bucketer.d_flatten_time);
586  log_verbose(stderr," sort : %.1fs\n", m_bucketer.d_count_sort_time);
587  log_verbose(stderr," search : %.1fs\n", m_bucketer.d_search_time);
588  log_verbose(stderr," setup : %.1fs\n", m_bucketer.d_setup_time);
589  log_verbose(stderr," scan : %.1fs\n", m_bucketer.suffixes.d_scan_time);
590  log_verbose(stderr," search : %.1fs\n", m_bucketer.suffixes.d_search_time);
591  }
592 
// Print (via log_verbose) the timing breakdown of the collection phase:
// chunk loading and collection timers of this class plus the per-stage
// timers accumulated by the internal core bucketer, and binning time.
// NOTE(review): the declaration line was elided by the documentation
// generator — presumably a void, parameter-less stats logger; confirm
// against the original source.
596  {
597  log_verbose(stderr," load : %.1fs\n", load_time);
598  log_verbose(stderr," collect : %.1fs\n", collect_time);
599  log_verbose(stderr," setup : %.1fs\n", m_bucketer.d_setup_time);
600  log_verbose(stderr," scan : %.1fs\n", m_bucketer.suffixes.d_scan_time);
601  log_verbose(stderr," search : %.1fs\n", m_bucketer.suffixes.d_search_time);
602  log_verbose(stderr," flatten : %.1fs\n", m_bucketer.d_flatten_time);
603  log_verbose(stderr," filter : %.1fs\n", m_bucketer.d_filter_time);
604  log_verbose(stderr," remap : %.1fs\n", m_bucketer.d_remap_time);
605  log_verbose(stderr," max : %.1fs\n", m_bucketer.d_max_time);
606  log_verbose(stderr," sort : %.1fs\n", m_bucketer.d_collect_sort_time);
607  log_verbose(stderr," copy : %.1fs\n", m_bucketer.d_copy_time);
608  log_verbose(stderr," bin : %.1fs\n", bin_time);
609  }
610 
611 public:
613  thrust::device_vector<uint32> d_buckets;
614  thrust::device_vector<uint32> d_subbuckets;
615  thrust::host_vector<uint64> h_bucket_offsets;
617 
618  float load_time;
619  float count_time;
620  float merge_time;
621 
623  float bin_time;
624 };
625 
628 template <uint32 SYMBOL_SIZE, uint32 N_BITS, uint32 DOLLAR_BITS, typename bucket_type = uint32>
630 {
634 
// Intentionally a no-op — presumably kept to mirror the device bucketer's
// interface; no per-instance timers are visible in this class.
637  void clear_timers() {}
638 
642  void count_init()
643  {
644  const uint32 n_buckets = 1u << N_BITS;
645 
646  // initialize the temporary and output vectors
647  alloc_storage( h_buckets, n_buckets );
648 
649  // initialize the bucket counters
650  for (uint32 i = 0; i < n_buckets; ++i)
651  h_buckets[i] = 0u;
652 
653  // initialize the global suffix counter
654  n_suffixes = 0u;
655  }
656 
// Sequentially count the suffixes of string_set on the host, bucketed by
// the first N_BITS-wide radix word of each suffix, accumulating into
// h_buckets. The batch total is also added to the persistent n_suffixes
// counter; the return value is the number of suffixes seen by this call only.
660  template <typename string_set_type>
661  uint64 count(const string_set_type& string_set)
662  {
663  typedef typename string_set_type::string_type string_type;
664 
665  // extract the first word
// NOTE(review): the construction of `word_functor` (used in the inner loop
// below) was elided here by the documentation generator — presumably a
// word-extraction functor parameterized on SYMBOL_SIZE / N_BITS /
// DOLLAR_BITS and bound to string_set; confirm against the original source.
667 
668  const uint32 n_strings = string_set.size();
669 
// per-call suffix counter, kept separate from the persistent member
670  uint64 _n_suffixes = 0u;
671 
672  // loop through all the strings in the set
673  for (uint32 i = 0; i < n_strings; ++i)
674  {
675  const string_type string = string_set[i];
676  const uint32 string_len = nvbio::length( string );
677 
678  // loop through all suffixes
679  for (uint32 j = 0; j < string_len; ++j)
680  {
681  // extract the first word of the j-th suffix
682  const bucket_type radix = word_functor( make_uint2( j, i ) );
683 
684  // and count it
685  ++h_buckets[ radix ];
686  }
687 
688  // increase total number of suffixes
689  _n_suffixes += string_len;
690  }
691 
692  // update the global counter
693  n_suffixes += _n_suffixes;
694 
695  return _n_suffixes;
696  }
697 
701  template <typename string_set_type, typename bucketmap_iterator>
703  const string_set_type& string_set,
704  const uint32 bucket_begin,
705  const uint32 bucket_end,
706  const uint32 string_offset,
707  const bucketmap_iterator bucketmap)
708  {
709  typedef typename string_set_type::string_type string_type;
710 
711  // extract the first word
713 
714  const uint32 n_strings = string_set.size();
715 
716  // keep track of the maximum suffix length
717  max_suffix_len = 0u;
718 
719  // keep track of the number of collected suffixes
720  n_collected = 0u;
721 
722  // loop through all the strings in the set
723  for (uint32 i = 0; i < n_strings; ++i)
724  {
725  const string_type string = string_set[i];
726  const uint32 string_len = nvbio::length( string );
727 
728  // loop through all suffixes
729  for (uint32 j = 0; j < string_len; ++j)
730  {
731  // extract the first word of the j-th suffix
732  const bucket_type radix = word_functor( make_uint2( j, i ) );
733 
734  // check whether the extracted radix is in the given bucket
735  if (radix >= bucket_begin && radix < bucket_end)
736  {
737  if (n_collected + 1u > h_radices.size())
738  {
739  h_radices.resize( (n_collected + 1u) * 2u );
740  h_suffixes.resize( (n_collected + 1u) * 2u );
741  }
742 
743  // remap the radix
744  h_radices[ n_collected ] = bucketmap[ radix ];
745  h_suffixes[ n_collected ] = make_uint2( j, i + string_offset );
746 
747  // advance the output index
748  n_collected++;
749  }
750  }
751 
752  // update the maximum suffix length
753  max_suffix_len = nvbio::max( max_suffix_len, string_len );
754  }
755  return n_collected;
756  }
757 
// This bucketer works entirely on the host: no device memory is ever allocated.
760  uint64 allocated_device_memory() const { return 0u; }
761 
765  {
766  return
767  h_buckets.size() * sizeof(uint32) +
768  h_radices.size() * sizeof(bucket_type) +
769  h_suffixes.size() * sizeof(uint2);
770  }
771 
772 public:
773  thrust::host_vector<uint32> h_buckets;
774  thrust::host_vector<bucket_type> h_radices;
775  thrust::host_vector<uint2> h_suffixes;
779 };
780 
790 template <
791  uint32 SYMBOL_SIZE,
792  bool BIG_ENDIAN,
793  typename storage_type,
794  uint32 N_BITS,
795  uint32 DOLLAR_BITS,
796  typename bucket_type>
798 {
800  typedef typename chunk_loader_type::chunk_set_type chunk_set_type;
801 
805  {
806  m_bucketers.resize( omp_get_num_procs() );
807  }
808 
812  {
813  collect_time = 0.0f;
814  merge_time = 0.0f;
815  bin_time = 0.0f;
816  }
817 
821  template <typename string_set_type>
823  const string_set_type& string_set,
824  thrust::host_vector<uint32>& h_buckets)
825  {
826  ScopedTimer<float> count_timer( &count_time );
827 
828  const uint32 n_threads = omp_get_max_threads();
829 
830  const uint32 n_buckets = 1u << N_BITS;
831  const uint32 chunk_size = 128*1024;
832  const uint32 batch_size = n_threads * chunk_size;
833  const uint32 n_strings = string_set.size();
834 
835  // initialize bucket counters
836  #pragma omp parallel for
837  for (int b = 0; b < int(n_buckets); ++b)
838  h_buckets[b] = 0u;
839 
840  // declare a chunk loader
841  chunk_loader_type chunk_loader;
842 
843  // keep track of the total number of suffixes
844  uint64 n_suffixes = 0u;
845 
846  // initialize bucket counters
847  for (uint32 i = 0; i < n_threads; ++i)
848  m_bucketers[i].count_init();
849 
850  // split the string-set in batches
851  for (uint32 batch_begin = 0; batch_begin < n_strings; batch_begin += batch_size)
852  {
853  // determine the end of this batch
854  const uint32 batch_end = nvbio::min( batch_begin + batch_size, n_strings );
855 
856  // split the batch in chunks, one per core
857  #pragma omp parallel
858  {
859  const uint32 tid = omp_get_thread_num();
860  const uint32 chunk_begin = batch_begin + tid * chunk_size;
861 
862  const uint32 chunk_end = nvbio::min( chunk_begin + chunk_size, batch_end );
863 
864  // make sure this thread is active
865  if (chunk_begin < batch_end)
866  {
867  const chunk_set_type chunk_set = chunk_loader.load( string_set, chunk_begin, chunk_end );
868 
869  m_bucketers[tid].count( chunk_set );
870  }
871  }
872  }
873 
874  // merge all buckets
875  {
876  ScopedTimer<float> timer( &merge_time );
877 
878  for (uint32 i = 0; i < n_threads; ++i)
879  {
880  // sum the number of suffixes
881  n_suffixes += m_bucketers[i].n_suffixes;
882 
883  #pragma omp parallel for
884  for (int b = 0; b < int(n_buckets); ++b)
885  h_buckets[b] += m_bucketers[i].h_buckets[b];
886  }
887  }
888 
889  //
890  // initialize internal bucketing helpers
891  //
892 
893  m_global_offset = 0u;
894 
895  // scan the bucket offsets so as to have global positions
896  alloc_storage( h_bucket_offsets, n_buckets );
900  h_bucket_offsets.begin() );
901 
902  return n_suffixes;
903  }
904 
908  template <typename string_set_type>
910  const string_set_type& string_set,
911  const uint32 bucket_begin,
912  const uint32 bucket_end,
913  uint32& max_suffix_len,
914  const thrust::host_vector<uint32>& h_subbuckets,
915  thrust::host_vector<uint2>& h_suffixes)
916  {
917  const uint32 batch_size = 1024*1024;
918  const uint32 n_strings = string_set.size();
919 
920  const uint32 n_threads = omp_get_max_threads();
921 
922  const uint32 chunk_size = util::divide_ri( batch_size, n_threads );
923 
924  // declare a chunk loader
925  chunk_loader_type chunk_loader;
926 
927  uint64 n_collected = 0u;
928 
929  // split the string-set in batches
930  for (uint32 batch_begin = 0; batch_begin < n_strings; batch_begin += batch_size)
931  {
932  // determine the end of this batch
933  const uint32 batch_end = nvbio::min( batch_begin + batch_size, n_strings );
934 
935  // perform bucketing for all the suffixes in the current batch
936  {
937  // time collection
939 
940  // split the batch in chunks, one per core
941  #pragma omp parallel
942  {
943  const uint32 tid = omp_get_thread_num();
944  const uint32 chunk_begin = batch_begin + tid * chunk_size;
945 
946  const uint32 chunk_end = nvbio::min( chunk_begin + chunk_size, batch_end );
947 
948  // make sure this thread is active
949  if (chunk_begin < batch_end)
950  {
951  const chunk_set_type chunk_set = chunk_loader.load( string_set, chunk_begin, chunk_end );
952 
953  m_bucketers[tid].collect(
954  chunk_set,
955  bucket_begin,
956  bucket_end,
957  chunk_begin,
958  h_subbuckets.begin() );
959  }
960  }
961  }
962 
963  max_suffix_len = 0u;
964 
965  // merge all output vectors, binning the output suffixes
966  for (uint32 i = 0; i < n_threads; ++i)
967  {
968  const uint32 chunk_begin = batch_begin + i * chunk_size;
969 
970  // make sure this thread is active
971  if (chunk_begin < batch_end)
972  {
973  // time binning
974  ScopedTimer<float> timer( &bin_time );
975 
976  // keep stats on the maximum suffix length
977  max_suffix_len = nvbio::max( max_suffix_len, m_bucketers[i].max_suffix_len );
978 
979  // check whether we are overflowing our output buffer
980  if (n_collected + m_bucketers[i].n_collected > h_suffixes.size())
981  {
982  log_error(stderr,"buffer size exceeded! (%llu/%llu)\n", n_collected + m_bucketers[i].n_collected, uint64( h_suffixes.size() ));
983  exit(1);
984  }
985 
986  // dispatch each suffix to their respective bucket
987  for (uint32 j = 0; j < m_bucketers[i].n_collected; ++j)
988  {
989  const uint2 loc = m_bucketers[i].h_suffixes[j];
990  const uint32 bucket = m_bucketers[i].h_radices[j];
991  const uint64 slot = h_bucket_offsets[bucket]++; // this could be done in parallel using atomics
992 
994  slot >= m_global_offset,
995  slot < m_global_offset + h_suffixes.size(),
996  "[%u:%u] = (%u,%u) placed at %llu - %llu (%u)\n", i, j, loc.x, loc.y, slot, m_global_offset, bucket );
997 
998  h_suffixes[ slot - m_global_offset ] = loc;
999  }
1000 
1001  // keep stats
1002  n_collected += m_bucketers[i].n_collected;
1003  }
1004  }
1005  }
1006 
1007  // keep track of how many suffixes we have collected globally
1008  m_global_offset += n_collected;
1009 
1010  return n_collected;
1011  }
1012 
// This bucketer works entirely on the host: no device memory is ever allocated.
1015  uint64 allocated_device_memory() const { return 0u; }
1016 
1020  {
1021  uint64 r = 0u;
1022  for (uint32 i = 0; i < m_bucketers.size(); ++i)
1024  return r;
1025  }
1026 
// Print (via log_verbose) the timing breakdown of the host counting phase.
// NOTE(review): the declaration line was elided by the documentation
// generator — presumably a void, parameter-less stats logger; confirm
// against the original source.
1030  {
1031  log_verbose(stderr," count : %.1fs\n", count_time);
// pure counting time = total minus the merge component
1032  log_verbose(stderr," count : %.1fs\n", count_time - merge_time);
1033  log_verbose(stderr," merge : %.1fs\n", merge_time);
1034  }
1035 
// Print (via log_verbose) the timing breakdown of the host collection phase.
// NOTE(review): the declaration line was elided by the documentation
// generator — presumably a void, parameter-less stats logger; confirm
// against the original source.
1039  {
1040  log_verbose(stderr," collect : %.1fs\n", collect_time);
1041  log_verbose(stderr," bin : %.1fs\n", bin_time);
1042  }
1043 
1044 public:
1045  std::vector< HostCoreSetSuffixBucketer<SYMBOL_SIZE,N_BITS,DOLLAR_BITS,bucket_type> > m_bucketers;
1047  thrust::host_vector<uint64> h_bucket_offsets;
1048 
1049  float count_time;
1050  float merge_time;
1051 
1053  float bin_time;
1054 };
1055 
1056 } // namespace priv
1057 } // namespace nvbio