NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
bwte_inl.h
Go to the documentation of this file.
1 /*
2  * nvbio
3  * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #pragma once
29 
30 #include <cub/cub.cuh>
31 #include <mgpuhost.cuh>
32 #include <moderngpu.cuh>
37 #include <nvbio/basic/cuda/arch.h>
38 #include <nvbio/basic/cuda/sort.h>
39 #include <nvbio/basic/vector.h>
40 #include <nvbio/basic/primitives.h>
41 #include <nvbio/strings/suffix.h>
42 #include <thrust/merge.h>
43 #include <algorithm>
44 
45 namespace nvbio {
46 
47 inline
48 void BWTEBlock::reserve(const uint32 _max_block_strings, const uint32 _max_block_suffixes)
49 {
50  priv::alloc_storage( h_dollar_off, _max_block_strings );
51  priv::alloc_storage( h_dollar_id, _max_block_strings );
52  priv::alloc_storage( h_dollar_pos, _max_block_strings );
53 
54  priv::alloc_storage( h_SA, _max_block_suffixes );
55  priv::alloc_storage( h_BWT, _max_block_suffixes );
56  priv::alloc_storage( h_cum_lengths, _max_block_strings );
57 
58  // pin all the host memory used in device <-> host copies
59  if (max_block_suffixes < _max_block_suffixes) cudaHostRegister( &h_SA[0], _max_block_suffixes * sizeof(uint32), cudaHostRegisterPortable );
60  if (max_block_strings < _max_block_strings) cudaHostRegister( &h_cum_lengths[0], _max_block_strings * sizeof(uint32), cudaHostRegisterPortable );
61  if (max_block_strings < _max_block_strings) cudaHostRegister( &h_dollar_off[0], _max_block_strings * sizeof(uint32), cudaHostRegisterPortable );
62  if (max_block_strings < _max_block_strings) cudaHostRegister( &h_dollar_id[0], _max_block_strings * sizeof(uint32), cudaHostRegisterPortable );
63 
64  #if defined(QUICK_CHECK_REPORT) || defined(CHECK_SORTING)
65  priv::alloc_storage( h_string_ids, _max_block_suffixes );
66  #endif
67 
68  max_block_suffixes = _max_block_suffixes;
69  max_block_strings = _max_block_strings;
70 }
71 
74 template <uint32 SYMBOL_SIZE, bool BIG_ENDIAN, typename storage_type, typename offsets_iterator>
76  mgpu_ctxt( mgpu::CreateCudaDevice( device ) ),
77  string_sorter( mgpu_ctxt ),
78  suffixes( mgpu_ctxt )
79 {
80  load_time = 0.0f;
81  sort_time = 0.0f;
82  copy_time = 0.0f;
83  rank_time = 0.0f;
84  insert_time = 0.0f;
85  insert_dollars_time = 0.0f;
86 
87  n_strings_ext = 0u;
88  n_suffixes_ext = 0u;
89 
90  n_processed_strings = 0u;
91  n_processed_suffixes = 0u;
92 }
93 
94 // needed device memory
95 //
96 template <uint32 SYMBOL_SIZE, bool BIG_ENDIAN, typename storage_type, typename offsets_iterator>
98 {
99  const size_t d_bytes =
100  string_sorter.needed_device_memory( _max_block_suffixes )
101  + string_set_handler.needed_device_memory( max_block_suffixes )
102  + suffixes.needed_device_memory( _max_block_strings, _max_block_suffixes )
103  + chunk_loader.needed_device_memory( _max_block_strings, _max_block_suffixes )
104  + _max_block_suffixes * sizeof(uint2) // d_suffixes
105  + _max_block_suffixes * sizeof(uint8) // d_temp_storage
106  + _max_block_suffixes * sizeof(uint8) // d_BWT_block
107  + _max_block_strings * sizeof(uint32) // d_dollar_off
108  + _max_block_strings * sizeof(uint32); // d_dollar_id
109 
110  return d_bytes;
111 }
112 
113 // reserve space for a maximum block size
114 //
115 template <uint32 SYMBOL_SIZE, bool BIG_ENDIAN, typename storage_type, typename offsets_iterator>
116 void BWTEContext<SYMBOL_SIZE,BIG_ENDIAN,storage_type,offsets_iterator>::reserve(const uint32 _max_block_strings, const uint32 _max_block_suffixes)
117 {
118  max_block_suffixes = _max_block_suffixes;
119  max_block_strings = _max_block_strings;
120 
121  const size_t d_bytes =
122  string_sorter.needed_device_memory( max_block_suffixes )
123  + string_set_handler.needed_device_memory( max_block_suffixes )
124  + suffixes.needed_device_memory( max_block_strings, max_block_suffixes )
125  + chunk_loader.needed_device_memory( _max_block_strings, _max_block_suffixes )
126  + max_block_suffixes * sizeof(uint2) // d_suffixes
127  + max_block_suffixes * sizeof(uint8) // d_temp_storage
128  + max_block_suffixes * sizeof(uint8) // d_BWT_block
129  + max_block_strings * sizeof(uint32) // d_dollar_off
130  + max_block_strings * sizeof(uint32); // d_dollar_id
131 
132  log_verbose(stderr, " allocating device sorting storage (%.1f GB)\n",
133  float( d_bytes ) / float(1024*1024*1024) );
134 
135  priv::alloc_storage( d_suffixes, max_block_suffixes );
136  priv::alloc_storage( d_temp_storage, max_block_suffixes );
137 
138  priv::alloc_storage( d_BWT_block, max_block_suffixes );
139 
140  priv::alloc_storage( d_dollar_off, max_block_strings );
141  priv::alloc_storage( d_dollar_id, max_block_strings );
142 
143  chunk_loader.reserve( max_block_strings, max_block_suffixes );
144 
145  suffixes.reserve( max_block_strings, max_block_suffixes );
146 
147  string_set_handler.reserve( max_block_suffixes, SORTING_SLICE_SIZE );
148  string_sorter.reserve( max_block_suffixes );
149 
150  const size_t h_bytes =
151  max_block_suffixes * sizeof(uint64) * 2 + // g, g_sorted
152  max_block_suffixes * sizeof(uint32) + // h_SA
153  max_block_strings * sizeof(uint32) + // h_cum_lengths
154  max_block_suffixes * sizeof(uint8) + // h_BWT
155  max_block_strings * sizeof(uint32) + // h_dollar_off
156  max_block_strings * sizeof(uint32) + // h_dollar_id
157  max_block_strings * sizeof(uint64); // h_dollar_pos
158 
159  log_verbose(stderr, " allocating host sorting storage (%.1f GB)\n",
160  float( h_bytes ) / float(1024*1024*1024) );
161 
162  priv::alloc_storage( g, max_block_suffixes );
163  priv::alloc_storage( g_sorted, max_block_suffixes );
164 
165  block.reserve( max_block_strings, max_block_suffixes );
166 }
167 
168 // append a new block of strings
169 //
170 template <uint32 SYMBOL_SIZE, bool BIG_ENDIAN, typename storage_type, typename offsets_iterator>
172  const uint32 block_begin,
173  const uint32 block_end,
174  const string_set_type string_set,
176  SparseSymbolSet& BWT_ext_dollars,
177  const bool forward)
178 {
179  sort_block( block_begin, block_end, string_set, block );
180 
181  merge_block(
182  block_begin,
183  block_end,
184  string_set,
185  block,
186  BWT_ext,
187  BWT_ext_dollars,
188  forward );
189 }
190 
191 // merge the given sorted block
192 //
193 template <uint32 SYMBOL_SIZE, bool BIG_ENDIAN, typename storage_type, typename offsets_iterator>
195  const uint32 block_begin,
196  const uint32 block_end,
197  const string_set_type string_set,
198  BWTEBlock& block,
200  SparseSymbolSet& BWT_ext_dollars,
201  const bool forward)
202 {
203  n_suffixes_ext = BWT_ext.size();
204  n_strings_ext = BWT_ext_dollars.size();
205 
206  // merge the new BWT with the old one
207  if (n_suffixes_ext)
208  {
209  //
210  // Rank all the suffixes in the sorted block wrt BWT_ext.
211  //
212 
213  rank_block(
214  block_begin,
215  block_end,
216  string_set,
217  block,
218  BWT_ext,
219  BWT_ext_dollars,
220  forward );
221 
222  insert_block(
223  block,
224  BWT_ext,
225  BWT_ext_dollars );
226  }
227  else
228  {
229  // store the BWT block in BWT_ext
230  BWT_ext.resize( block.n_suffixes, &block.h_BWT[0] );
231 
232  // save the initial set of dollars
233  BWT_ext_dollars.set(
234  block.n_suffixes,
235  block.n_strings,
236  raw_pointer( block.h_dollar_off ),
237  raw_pointer( block.h_dollar_id ) );
238 
239  #if defined(CHECK_COPY)
240  log_visible(stderr, " check copy... ");
241  uint64 occ[SYMBOL_COUNT] = { 0 };
242  for (uint32 i = 0; i < block.n_suffixes; ++i)
243  {
244  if ((i % 1000) == 0)
245  log_visible(stderr, "\r check copy... %6.2f%% ", 100.0f * float(i)/float(block.n_suffixes));
246 
247  const uint32 cb = block.h_BWT[i] & (SYMBOL_COUNT-1);
248  const uint32 ce = BWT_ext[i];
249  if (ce != cb)
250  {
251  log_error(stderr, "at %u, expected %u, got %u!\n", i, cb, ce);
252  exit(1);
253  }
254 
255  ++occ[ ce ];
256  for (uint8 q = 0; q < SYMBOL_COUNT; ++q)
257  {
258  const uint64 r = BWT_ext.rank( i, q );
259  if (r != occ[q])
260  {
261  log_error(stderr, "ranks mismatch at c[%u], i[%llu]:p[%llu]: expected %llu, got %llu!\n",
262  uint32(q), i, occ[q], r );
263  exit(1);
264  }
265  }
266  }
267  log_visible(stderr, "\r check copy... \n");
268  #endif
269 
270  #if defined(CHECK_SORTING)
271  // build a suffix localizer
272  const priv::localize_suffix_functor suffix_localizer(
275  block_begin );
276 
277  for (uint32 i = 0; i < n_block_suffixes; ++i)
278  TSA[i] = suffix_localizer( block.h_SA[i] );
279  #endif
280  }
281 
282  // update counters
283  n_processed_strings += block.n_strings;
284  n_processed_suffixes += block.n_suffixes;
285 }
286 
287 // sort the given device block
288 //
289 template <uint32 SYMBOL_SIZE, bool BIG_ENDIAN, typename storage_type, typename offsets_iterator>
291  const uint32 block_begin,
292  const uint32 block_end,
293  const string_set_type string_set,
294  BWTEBlock& block)
295 {
296  block.reserve( max_block_strings, max_block_suffixes );
297 
298  const uint32 n_block_strings = block_end - block_begin;
299 
300  uint32 n_block_suffixes = 0u;
301  for (uint32 i = block_begin; i < block_end; ++i)
302  n_block_suffixes += nvbio::length( string_set[i] ) + 1u;
303 
304  block.n_strings = n_block_strings;
305  block.n_suffixes = n_block_suffixes;
306 
307  // load this block
308  log_debug(stderr, " load device chunk (%u)\n", n_block_suffixes);
309 
310  Timer timer;
311  timer.start();
312 
313  const chunk_set_type chunk_set = chunk_loader.load( string_set, block_begin, block_end );
314 
315  NVBIO_CUDA_DEBUG_STATEMENT( cudaDeviceSynchronize() );
316  timer.stop();
317  load_time += timer.seconds();
318 
319  // compute its SA and BWT
320  log_debug(stderr, " sort\n");
321  {
322  const uint32 SYMBOLS_PER_RADIXWORD = priv::symbols_per_word<SYMBOL_SIZE,RADIX_BITS,DOLLAR_BITS>();
323 
324  log_debug(stderr, " set suffix flattener\n");
325  {
326  cuda::ScopedTimer<float> timer( &sort_time );
327 
328  // prepare the suffix flattener
329  suffixes.set( chunk_set );
330 
331  // make sure we get the proper number of suffixes
332  if (suffixes.n_suffixes != n_block_suffixes)
333  {
334  log_error(stderr, " unexpected number of suffixes! (%u)\n", suffixes.n_suffixes);
335  exit(1);
336  }
337  }
338 
339  // copy the suffix flattener to the host
340  {
341  cuda::ScopedTimer<float> timer( &copy_time );
342 
343  #if defined(HOST_STRING_IDS)
344  thrust::copy(
345  suffixes.string_ids.begin(),
346  suffixes.string_ids.begin() + n_block_suffixes,
347  block.h_string_ids.begin() );
348  #endif
349 
350  thrust::copy(
351  suffixes.cum_lengths.begin(),
352  suffixes.cum_lengths.begin() + n_block_strings,
353  block.h_cum_lengths.begin() );
354  }
355 
356  log_debug(stderr, " localize suffixes\n");
357  {
358  cuda::ScopedTimer<float> timer( &sort_time );
359 
360  // build a suffix localizer
361  const priv::localize_suffix_functor suffix_localizer(
362  nvbio::raw_pointer( suffixes.cum_lengths ),
363  nvbio::raw_pointer( suffixes.string_ids ) );
364 
365  // build the list of suffixes
366  nvbio::transform<device_tag>(
367  n_block_suffixes,
368  thrust::make_counting_iterator<uint32>(0),
369  d_suffixes.begin(),
370  suffix_localizer );
371  }
372 
373  // reuse suffixes.string_ids to store the final indices
374  thrust::device_ptr<uint32> d_indices( nvbio::raw_pointer( suffixes.string_ids ) );
375 
376  // temporary BWT storage
377  thrust::device_ptr<uint8> d_unsorted_bwt = thrust::device_ptr<uint8>( raw_pointer( d_temp_storage ) );
378  thrust::device_ptr<uint8> d_bwt = thrust::device_ptr<uint8>( raw_pointer( d_BWT_block ) );
379 
380  log_debug(stderr, " compression sort\n");
381  {
382  cuda::ScopedTimer<float> timer( &sort_time );
383 
384  // compute the maximum number of words needed to represent a suffix
385  const uint32 n_words = util::divide_ri( suffixes.max_length( chunk_set ), SYMBOLS_PER_RADIXWORD );
386 
387  //
388  // sort the suffixes, building the SA
389  //
390 
391  // initialize the set radices
392  string_set_handler.set( chunk_set );
393  string_set_handler.init( n_block_suffixes, NULL, nvbio::raw_pointer( d_suffixes ) );
394 
395  cuda::DiscardDelayList delay_list;
396 
397  // sort the suffix strings!
398  string_sorter.sort(
399  string_set_handler,
400  n_block_suffixes,
401  n_words,
402  thrust::make_counting_iterator<uint32>(0u),
403  d_indices,
404  uint32(-1),
405  delay_list,
406  SORTING_SLICE_SIZE );
407 
408  //
409  // extract the BWT from the SA
410  //
411 
412  // copy the unsorted symbols corresponding to each suffix
413  nvbio::transform<device_tag>(
414  n_block_suffixes,
415  d_suffixes.begin(),
416  d_unsorted_bwt,
418 
419  // gather the symbols in sorted order
420  thrust::gather(
421  d_indices,
422  d_indices + n_block_suffixes,
423  d_unsorted_bwt,
424  d_bwt );
425  }
426 
427  // copy back results to the host
428  log_debug(stderr, " copy to host\n");
429  {
430  cuda::ScopedTimer<float> timer( &copy_time );
431 
432  // copy the SA
433  thrust::copy(
434  d_indices,
435  d_indices + n_block_suffixes,
436  block.h_SA.begin() );
437 
438  // copy the BWT
439  thrust::copy(
440  d_bwt,
441  d_bwt + n_block_suffixes,
442  block.h_BWT.begin() );
443 
444  // copy the dollar offsets and their string ids
445  const uint32 n_dollars = nvbio::copy_flagged(
446  n_block_suffixes,
447  thrust::make_zip_iterator( thrust::make_tuple(
448  thrust::make_counting_iterator<uint32>(0),
451  thrust::make_zip_iterator( thrust::make_tuple(
452  d_dollar_off.begin(),
453  d_dollar_id.begin() ) ),
454  d_temp_storage );
455 
456  if (n_dollars != n_block_strings)
457  {
458  log_error(stderr, "mismatching number of dollars! expected %u, got %u\n", n_block_strings, n_dollars);
459  exit(1);
460  }
461 
462  // copy the dollar offsets
463  thrust::copy(
464  d_dollar_off.begin(),
465  d_dollar_off.begin() + n_block_strings,
466  block.h_dollar_off.begin() );
467 
468  // copy the dollar ids
469  thrust::copy(
470  d_dollar_id.begin(),
471  d_dollar_id.begin() + n_block_strings,
472  block.h_dollar_id.begin() );
473  }
474  }
475 
476  log_verbose(stderr, " sort : %.1f M suffixes/s (sort: %.1f%%, copy: %.1f%%)\n",
477  (1.0e-6f * (n_processed_suffixes + n_block_suffixes)) / (sort_time + copy_time),
478  100.0f * sort_time / (sort_time + copy_time),
479  100.0f * copy_time / (sort_time + copy_time));
480 }
481 
482 // rank the block suffixes wrt BWT_ext
483 //
484 template <uint32 SYMBOL_SIZE, bool BIG_ENDIAN, typename storage_type, typename offsets_iterator>
486  const uint32 block_begin,
487  const uint32 block_end,
488  const string_set_type string_set,
489  const BWTEBlock& block,
491  SparseSymbolSet& BWT_ext_dollars,
492  const bool forward)
493 {
494  typedef typename string_set_type::string_type string_type;
495 
496  const uint32 n_block_strings = block.n_strings;
497  const uint32 n_block_suffixes = block.n_suffixes;
498 
499  //
500  // Rank all the suffixes in the sorted block wrt BWT_ext.
501  // Currently, we just do a parallel loop through all the strings in the block,
502  // and for each of them iterate through their suffixes starting from the shortest.
503  //
504  // An alternative would be to break the *sorted* suffixes in batches of equal
505  // length (i.e. first all the sorted 1-suffixes, then all the sorted 2-suffixes, etc),
506  // and do a series of parallel loops through them: this would ensure that the
507  // ranking operations perform coherent memory accesses.
508  //
509  // How do we select all the j-suffixes in sorted order from the SA?
510  // Is there any better way then doing a separate parallel select across all of them for each j?
511  // In a way, it's a multi-select, where a single pass could essentially do N-scans at
512  // the same time...
513  //
514 
515  Timer timer;
516  timer.start();
517 
518  // initialize the insertion array
519  //#pragma omp parallel for
520  //for (int i = 0; i < int( n_block_suffixes ); ++i)
521  // g[i] = 0u;
522 
523  log_debug(stderr, " rank\n");
524  log_debug(stderr, " C:\n");
525  uint64 C[SYMBOL_COUNT+1];
526  {
527  const uint64* freqs = BWT_ext.symbol_frequencies();
528  C[0] = n_strings_ext; // number of dollars
529  for (uint32 i = 1; i <= SYMBOL_COUNT; ++i)
530  {
531  C[i] = C[i-1] + freqs[i-1];
532 
533  // if this is the symbol used to represent dollars we have to remove them
534  if ((255u % SYMBOL_COUNT) == i-1)
535  C[i] -= n_strings_ext;
536 
537  log_debug(stderr, " %llu\n", C[i] );
538  }
539  }
540 
541  const uint32 DOLLAR_SYMBOL = 255u & (SYMBOL_COUNT-1);
542 
543  // loop through all the strings in this block
544  #pragma omp parallel for
545  for (int32 q = 0; q < int32( n_block_strings ); ++q)
546  {
547  const string_type string = string_set[block_begin + q];
548 
549  const uint32 len = nvbio::length( string );
550 
551  // start with the dollar sign: C[$_q] + rank($_q,i) reduces to C[$_q] = #dollars(block) = (backwards ? 0 : block_begin)
552  uint64 i = forward ? n_strings_ext : 0u;
553 
554  const uint32 suffix_off = q ? block.h_cum_lengths[ q-1 ] : 0u;
555 
556  g[suffix_off + len] = i;
557 
558  // loop backwards through all the suffixes of the current string
559  for (int32 k = len-1u; k >= 0; --k)
560  {
561  const uint8 c = string[k];
562 
563  const uint64 i_dollars = (c == DOLLAR_SYMBOL) ?
564  BWT_ext_dollars.rank(i-1) : 0u;
565 
566  // compute the number of external suffixes 'j' preceding this one
567  const uint64 r = C[c] + BWT_ext.rank( i-1, c );
568  const uint64 j = r - i_dollars;
569  assert( i_dollars <= r );
570  assert( j <= n_suffixes_ext );
571 
572  // save the insertion slot
573  g[suffix_off + k] = j;
574 
575  #if 0
576  if ((q == 243303 && k >= 56) ||
577  (q == 2163961 && k >= 56))
578  {
579  if (k == 99) log_verbose(stderr, "\n");
580  log_verbose(stderr, " q[%8u:%2u] : LF[%9llu,%u] = %9llu (r[%9llu], $[%9llu]\n",
581  q, k, i, uint32(c), j, r, i_dollars,
582  i_dollars,
583  i_dollars );
584  }
585  #endif
586 
587  i = j;
588  }
589  }
590 
591  // reorder g based on SA
592  #pragma omp parallel for
593  for (int32 i = 0; i < int32( n_block_suffixes ); ++i)
594  g_sorted[i] = g[block.h_SA[i]];
595 
596  timer.stop();
597  rank_time += timer.seconds();
598  log_verbose(stderr, " rank : %.1f M suffixes/s\n",
599  (1.0e-6f * (n_processed_suffixes + n_block_suffixes)) / rank_time);
600 
601  #if defined(CHECK_SORTING)
602  log_visible(stderr, " check sorting... ");
603 
604  // build a suffix localizer
605  const priv::localize_suffix_functor suffix_localizer(
608  block_begin );
609 
610  typedef Suffix<input_string_type,uint32> suffix_type;
611 
612  for (uint32 i = 0; i < n_block_suffixes; ++i)
613  {
614  if ((i % 1000) == 0)
615  log_visible(stderr, "\r check sorting... %6.2f%% ", 100.0f * float(i)/float(n_block_suffixes));
616 
617  const uint32 sa = block.h_SA[i];
618 
619  const uint2 suffix_i = suffix_localizer( sa );
620 
621  if (g_sorted[i] < n_suffixes_ext)
622  {
623  const uint2 suffix_n = TSA[ g_sorted[i] ];
624 
625  // make sure s_i <= s_n
626  const int32 cmp = compare_suffixes( string_set, suffix_i, suffix_n );
627 
628  if (cmp == 1)
629  {
630  log_error(stderr, "\ninsertion[%u -> %llu] : (%u,%u) > (%u,%u)\n",
631  i, g_sorted[i],
632  suffix_i.y, suffix_i.x,
633  suffix_n.y, suffix_n.x );
634 
635  const suffix_type string_i = make_suffix( string_set[suffix_i.y], suffix_i.x );
636  const suffix_type string_n = make_suffix( string_set[suffix_n.y], suffix_n.x );
637 
638  log_error(stderr, " I: \"");
639  for (uint32 j = 0; j < nvbio::length( string_i ); ++j)
640  {
641  const uint8 c_i = string_i[j];
642  log_error_cont(stderr, "%u", uint32(c_i) );
643  }
644  log_error_cont(stderr, "\"\n" );
645 
646  log_error(stderr, " N: \"");
647  for (uint32 j = 0; j < nvbio::length( string_n ); ++j)
648  {
649  const uint8 c_n = string_n[j];
650  log_error_cont(stderr, "%u", uint32(c_n) );
651  }
652  log_error_cont(stderr, "\"\n" );
653 
654  exit(1);
655  }
656  }
657  if (g_sorted[i] > 0)
658  {
659  const uint2 suffix_p = TSA[ g_sorted[i]-1 ];
660 
661  // make sure s_p < s_i
662  const int32 cmp = compare_suffixes( string_set, suffix_i, suffix_p );
663 
664  if (cmp == -1)
665  {
666  log_error(stderr, "\ninsertion[%u-1 -> %llu] : (%u,%u) < (%u, %u)\n",
667  i,
668  g_sorted[i],
669  suffix_i.y, suffix_i.x,
670  suffix_p.y, suffix_p.x );
671 
672  const suffix_type string_i = make_suffix( string_set[suffix_i.y], suffix_i.x );
673  const suffix_type string_p = make_suffix( string_set[suffix_p.y], suffix_p.x );
674 
675  log_error(stderr, " I: \"");
676  for (uint32 j = 0; j < nvbio::length( string_i ); ++j)
677  {
678  const uint8 c_i = string_i[j];
679  log_error_cont(stderr, "%u", uint32(c_i) );
680  }
681  log_error_cont(stderr, "\"\n" );
682 
683  log_error(stderr, " P: \"");
684  for (uint32 j = 0; j < nvbio::length( string_p ); ++j)
685  {
686  const uint8 c_p = string_p[j];
687  log_error_cont(stderr, "%u", uint32(c_p) );
688  }
689  log_error_cont(stderr, "\"\n" );
690 
691  exit(1);
692  }
693  }
694  }
695  log_visible(stderr, "\r check sorting... done \n");
696  exit(1);
697  #endif
698 
699  #if defined(QUICK_CHECK)
700  log_verbose(stderr, " check sorting\n");
701  // check that the suffix ranks are truly sorted
702  if (is_sorted<host_tag>( n_block_suffixes, &g_sorted[0] ) == false)
703  {
704  log_error(stderr, " BWTE: suffix ranks not in sorted order!\n");
705  #if defined(QUICK_CHECK_REPORT)
706  for (uint32 i = 0; i < n_block_suffixes-1; ++i)
707  {
708  if (g_sorted[i] > g_sorted[i+1])
709  {
710  log_error(stderr, " g_s[%u->%u] = %llu > g_s[%u->%u] = %llu\n",
711  i, block.h_SA[i], g_sorted[i],
712  i+1, block.h_SA[i+1], g_sorted[i+1] );
713 
714  // build a suffix localizer
715  const priv::localize_suffix_functor suffix_localizer(
717  nvbio::raw_pointer( block.h_string_ids ) );
718 
719  const uint2 suffix1 = suffix_localizer( block.h_SA[i] );
720  const uint2 suffix2 = suffix_localizer( block.h_SA[i+1] );
721 
722 
723  log_error(stderr, " (%02u,%8u): ", suffix1.x, suffix1.y);
724  {
725  const string_type string = string_set[block_begin + suffix1.y];
726  for (uint32 j = suffix1.x; j < nvbio::length( string ); ++j)
727  log_error_cont(stderr, "%u", uint32( string[j] ) );
728  log_error_cont(stderr, "\n");
729  }
730  log_error(stderr, " (%02u,%8u): ", suffix2.x, suffix2.y);
731  {
732  const string_type string = string_set[block_begin + suffix2.y];
733  for (uint32 j = suffix2.x; j < nvbio::length( string ); ++j)
734  log_error_cont(stderr, "%u", uint32( string[j] ) );
735  log_error_cont(stderr, "\n");
736  }
737  break;
738  }
739  }
740  #endif
741  exit(1);
742  }
743  #endif
744 }
745 
746 // insert the block
747 //
748 template <uint32 SYMBOL_SIZE, bool BIG_ENDIAN, typename storage_type, typename offsets_iterator>
749 void BWTEContext<SYMBOL_SIZE,BIG_ENDIAN,storage_type,offsets_iterator>::insert_block(
750  BWTEBlock& block,
751  PagedText<SYMBOL_SIZE,BIG_ENDIAN>& BWT_ext,
752  SparseSymbolSet& BWT_ext_dollars)
753 {
754  const uint32 n_block_strings = block.n_strings;
755  const uint32 n_block_suffixes = block.n_suffixes;
756 
757  #if defined(CHECK_INSERTION)
758  // make a copy of BWT_ext
759  nvbio::vector<host_tag,uint8> BWT_ext_copy( n_suffixes_ext );
760  #pragma omp parallel for
761  for (int32 i = 0; i < int32( n_suffixes_ext ); ++i)
762  BWT_ext_copy[i] = BWT_ext[i];
763  #endif
764 
765  log_debug(stderr, " insert\n");
766  {
767  ScopedTimer<float> timer( &insert_time );
768 
769  // insert BWT[i] at g_sorted[i]
770  BWT_ext.insert( n_block_suffixes, &g_sorted[0], &block.h_BWT[0] );
771  }
772  log_debug(stderr, " insert dollars\n");
773  {
774  ScopedTimer<float> timer( &insert_dollars_time );
775 
776  // build h_dollar_pos
777  thrust::gather(
778  block.h_dollar_off.begin(),
779  block.h_dollar_off.begin() + n_block_strings,
780  g_sorted.begin(),
781  block.h_dollar_pos.begin() );
782 
783  // insert the block dollars in BWT_ext_dollars
784  BWT_ext_dollars.insert(
785  BWT_ext.size(),
786  n_block_suffixes,
787  raw_pointer( g_sorted ),
788  n_block_strings,
789  raw_pointer( block.h_dollar_off ),
790  raw_pointer( block.h_dollar_pos ),
791  raw_pointer( block.h_dollar_id ) );
792  }
793  log_verbose(stderr, " insert : %.1f M suffixes/s (symbols: %.1f%%, dollars: %.1f%%)\n",
794  (1.0e-6f * (n_processed_suffixes + n_block_suffixes)) / (insert_time + insert_dollars_time),
795  100.0f * insert_time / (insert_time + insert_dollars_time),
796  100.0f * insert_dollars_time / (insert_time + insert_dollars_time));
797 
798 #if defined(CHECK_INSERTION)
799  log_visible(stderr, " check insertions");
800  uint64 occ[SYMBOL_COUNT] = { 0 };
801  {
802  uint64 p = 0;
803  uint64 o = 0;
804  uint32 n_dollars = 0;
805  for (uint32 j = 0; j < n_block_suffixes; ++j)
806  {
807  if ((j % 1000) == 0)
808  log_visible(stderr, "\r check insertions... %6.2f%% ", 100.0f * float(j)/float(n_block_suffixes));
809 
810  const uint64 g_j = g_sorted[j];
811 
812  while (p < nvbio::min( g_j, n_suffixes_ext ))
813  {
814  if (BWT_ext[o] != BWT_ext_copy[p])
815  {
816  log_error(stderr, "insertion mismatch at o[%llu]:p[%llu]: expected %u, got %u\n", o, p,
817  uint32( BWT_ext_copy[p] ), uint32( BWT_ext[o] ));
818  exit(1);
819  }
820 
821  ++occ[ BWT_ext[o] ];
822  for (uint8 q = 0; q < SYMBOL_COUNT; ++q)
823  {
824  const uint64 r = BWT_ext.rank( o, q );
825  if (r != occ[q])
826  {
827  log_error(stderr, "ranks mismatch at c[%u], o[%llu]:p[%llu]: expected %llu, got %llu\n",
828  uint32(q), o, p, occ[q], r );
829  exit(1);
830  }
831  }
832 
833  ++o;
834  ++p;
835  }
836 
837  const uint32 cc = block.h_BWT[j] % SYMBOL_COUNT;
838  n_dollars += (block.h_BWT[j] == 255u) ? 1u : 0u;
839 
840  if (BWT_ext[o] != cc)
841  {
842  const uint32 ce = BWT_ext[o];
843  log_error(stderr, "insertion mismatch at o[%llu]:j[%u]:g[%llu]: expected %u, got %u (page[%u:%llu:%p])\n",
844  o, j, g_j,
845  cc, ce, BWT_ext.find_page(o), BWT_ext.m_offsets[ BWT_ext.find_page(o) ], BWT_ext.get_page( BWT_ext.find_page(o) ));
846  exit(1);
847  }
848  ++occ[ cc ];
849  for (uint8 q = 0; q < SYMBOL_COUNT; ++q)
850  {
851  const uint64 r = BWT_ext.rank( o, q );
852  if (r != occ[q])
853  {
854  log_error(stderr, "ranks mismatch at c[%u], o[%llu]:j[%llu]: expected %llu, got %llu\n",
855  uint32(q), o, p, occ[q], r );
856  exit(1);
857  }
858  }
859  ++o;
860  }
861  while (p < n_suffixes_ext)
862  {
863  if (BWT_ext[o] != BWT_ext_copy[p])
864  {
865  log_error(stderr, "insertion mismatch at o[%llu]:p[%llu]: expected %u, got %u\n", o, p,
866  uint32( BWT_ext_copy[p] ), uint32( BWT_ext[o] ));
867  exit(1);
868  }
869  ++occ[ BWT_ext[o] ];
870  for (uint8 q = 0; q < SYMBOL_COUNT; ++q)
871  {
872  const uint64 r = BWT_ext.rank( o, q );
873  if (r != occ[q])
874  {
875  log_error(stderr, "ranks mismatch at c[%u], o[%llu]:p[%llu]: expected %llu, got %llu\n",
876  uint32(q), o, p, occ[q], r );
877  exit(1);
878  }
879  }
880  ++o;
881  ++p;
882  }
883  }
884  log_visible(stderr, "\r check insertions \n");
885 #endif
886 }
887 
895 template <uint32 SYMBOL_SIZE, bool BIG_ENDIAN, typename storage_type, typename offsets_iterator>
896 void bwte(
897  const ConcatenatedStringSet<
898  PackedStream<storage_type,uint8,SYMBOL_SIZE,BIG_ENDIAN,typename std::iterator_traits<offsets_iterator>::value_type>,
899  offsets_iterator> string_set,
901  SparseSymbolSet& BWT_ext_dollars,
902  BWTParams* params)
903 {
905  typedef ConcatenatedStringSet<input_packed_stream_type,uint64*> input_string_set_type;
907 
908  // fetch the number of strings
909  const uint32 N = string_set.size();
910 
911  // compute the size of the final BWT
912  uint64 bwt_size = 0;
913  for (uint32 i = 0; i < N; ++i)
914  {
915  const input_string_type string = string_set[i];
916  const uint32 len = nvbio::length( string );
917 
918  bwt_size += len + 1u;
919  }
920 
921  // alloc enough storage for the BWT vectors
922  log_verbose(stderr, " allocating host BWT storage (%.1f GB)\n",
923  float( BWT_ext.needed_host_memory( bwt_size ) + N * sizeof(uint64) ) / float(1024*1024*1024) );
924 
925  BWT_ext.reserve( bwt_size );
926  BWT_ext_dollars.reserve( bwt_size, N );
927 
928  // prepare a BWTEContext
929  int current_device;
930  cudaGetDevice( &current_device );
932 
933  const uint32 max_block_suffixes = 256*1024*1024;
934  const uint32 max_block_strings = 16*1024*1024;
935 
936  bwte_context.reserve( max_block_strings, max_block_suffixes );
937 
938  const bool forward = true;
939 
940  // BWT merging loop
941  for (uint32 block_begin = forward ? 0 : N,
942  block_end = forward ? 0 : N;
943  forward ? (block_begin < N) :
944  (block_end > 0);
945  )
946  {
947  uint32 n_block_suffixes = 0;
948 
949  if (forward)
950  {
951  // go forward finding block_end so as to collect a given number of suffixes
952  for (uint32 i = block_begin; block_begin < N; ++i)
953  {
954  const input_string_type string = string_set[i];
955 
956  const uint32 string_len = nvbio::length( string );
957 
958  // check whether we need to stop growing the block
959  if (n_block_suffixes + string_len + 1u > max_block_suffixes ||
960  i - block_begin > max_block_strings)
961  break;
962 
963  n_block_suffixes += string_len + 1u;
964  block_end = i+1;
965  }
966  }
967  else
968  {
969  // go backwards finding block_begin so as to collect a given number of suffixes
970  for (int32 i = block_end-1; i >= 0; --i)
971  {
972  const input_string_type string = string_set[i];
973 
974  const uint32 string_len = nvbio::length( string );
975 
976  // check whether we need to stop growing the block
977  if (n_block_suffixes + string_len + 1u > max_block_suffixes ||
978  block_end - i > max_block_strings)
979  break;
980 
981  n_block_suffixes += string_len + 1u;
982  block_begin = i;
983  }
984  }
985 
986  log_info(stderr, " block [%u, %u] (%u suffixes)\n", block_begin, block_end, n_block_suffixes);
987 
988  bwte_context.append_block(
989  block_begin,
990  block_end,
991  string_set,
992  BWT_ext,
993  BWT_ext_dollars,
994  forward );
995 
996  if (forward)
997  block_begin = block_end;
998  else
999  block_end = block_begin;
1000  }
1001 }
1002 
1003 } // namespace nvbio