NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
paged_text_inl.h
Go to the documentation of this file.
1 /*
2  * nvbio
3  * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #pragma once
29 
30 namespace nvbio {
31 
32 uint32 popc_2bit(const uint64* base, const uint32 n, const uint32 mod, const uint32 c);
33 
36 template <typename word_type>
38 {
39  typedef word_type argument_type;
40  typedef word_type result_type;
41 
45  divide_ri_functor(const word_type _k) : k(_k) {}
46 
51  {
52  // divide i by k
53  return util::divide_ri( i, k );
54  }
55 
56  const word_type k;
57 };
58 
59 template <typename input_storage, typename output_storage, uint32 SYMBOL_SIZE, typename index_type>
61 void copy(
62  const uint32 len,
65 {
66  typedef typename std::iterator_traits<input_storage>::value_type word_type;
67 
68  const uint32 WORD_SIZE = 8u * sizeof(word_type);
69  const uint32 SYMBOLS_PER_WORD = WORD_SIZE / SYMBOL_SIZE;
70 
71  input_storage in_words = in.stream();
72  output_storage out_words = out.stream();
73 
74  const index_type k_start = in.index();
75  const index_type n_start = out.index();
76 
77  index_type k = k_start;
78  index_type n = n_start;
79 
80  const index_type k_end = k + len;
81 
82  const index_type out_word_begin = util::divide_ri( k, SYMBOLS_PER_WORD );
83  const index_type out_word_end = util::divide_rz( k_end, SYMBOLS_PER_WORD );
84 
85  // check whether the whole segment is contained in one word
86  if (out_word_end <= out_word_begin)
87  {
88  while (k < k_end)
89  out[k++ - k_start] = in[n++ - n_start];
90 
91  return;
92  }
93 
94  // align k to a word boundary
95  while (k < out_word_begin*SYMBOLS_PER_WORD)
96  out[k++ - k_start] = in[n++ - n_start];
97 
98  for (index_type out_word = out_word_begin; out_word < out_word_end; ++out_word)
99  {
100  // fetch a word's worth of input, starting from n
101  const uint32 n_word = n / SYMBOLS_PER_WORD;
102  const uint32 n_mod = n & (SYMBOLS_PER_WORD-1);
103  const uint32 n_syms = SYMBOLS_PER_WORD - n_mod;
104 
105  // fetch the last 'n_syms' symbols of the first word
106  word_type in_word = in_words[n_word] << (n_mod * SYMBOL_SIZE);
107 
108  if (n_syms < SYMBOLS_PER_WORD)
109  {
110  // fetch the remaining symbols from the next word (deleting the first 'n_syms)
111  in_word |= (in_words[n_word+1] >> (n_syms * SYMBOL_SIZE));
112  }
113 
114  // ...and write it out
115  out_words[ out_word ] = in_word;
116 
117  // go forward
118  k += SYMBOLS_PER_WORD;
119  n += SYMBOLS_PER_WORD;
120  }
121 
122  // finish copying leftovers
123  while (k < k_end)
124  out[k++ - k_start] = in[n++ - n_start];
125 }
126 
127 // save occurrence counters if needed
128 template <uint32 SYMBOL_COUNT>
130 void save_occurrences(const uint32 k, const uint32 occ_intv_log, const uint32 occ_intv, const uint32* partials, uint32* occ)
131 {
132  // check whether we need to save the occurrence counters
133  if ((k & (occ_intv-1)) == 0)
134  {
135  const uint32 block_idx = k >> occ_intv_log;
136  for (uint32 q = 0; q < SYMBOL_COUNT; ++q)
137  occ[ SYMBOL_COUNT * block_idx + q ] = partials[q];
138  }
139 }
140 
141 template <typename input_storage, typename output_storage, uint32 SYMBOL_SIZE, typename index_type>
142 NVBIO_FORCEINLINE NVBIO_HOST/*_DEVICE*/
143 void copy(
144  const uint32 len,
147  const uint32 occ_intv_log,
148  const uint32 occ_intv,
149  uint32* partials,
150  uint32* occ,
151  const uint32* count_table)
152 {
153  typedef typename std::iterator_traits<input_storage>::value_type word_type;
154 
155  const uint32 WORD_SIZE = 8u * sizeof(word_type);
156  const uint32 SYMBOLS_PER_WORD = WORD_SIZE / SYMBOL_SIZE;
157  const uint32 SYMBOL_COUNT = 1u << SYMBOL_SIZE;
158 
159  input_storage in_words = in.stream();
160  output_storage out_words = out.stream();
161 
162  const index_type k_start = out.index();
163  const index_type n_start = in.index();
164 
165  index_type k = k_start;
166  index_type n = n_start;
167 
168  const index_type k_end = k + len;
169 
170  const index_type out_word_begin = util::divide_ri( k, SYMBOLS_PER_WORD );
171  const index_type out_word_end = util::divide_rz( k_end, SYMBOLS_PER_WORD );
172 
173  // check whether the whole segment is contained in one word
174  if (out_word_end <= out_word_begin)
175  {
176  while (k < k_end)
177  {
178  // check whether we need to save the occurrence counters
179  save_occurrences<SYMBOL_COUNT>( k, occ_intv_log, occ_intv, partials, occ );
180 
181  const uint8 c = in[n++ - n_start];
182 
183  out[k++ - k_start] = c;
184 
185  ++partials[c];
186  }
187  return;
188  }
189 
191 
192  // align k to a word boundary
193  while (k < out_word_begin*SYMBOLS_PER_WORD)
194  {
195  // check whether we need to save the occurrence counters
196  save_occurrences<SYMBOL_COUNT>( k, occ_intv_log, occ_intv, partials, occ );
197 
198  const uint8 c = in[n++ - n_start];
199 
200  out[k++ - k_start] = c;
201 
202  ++partials[c];
203  }
204 
205  for (index_type out_word = out_word_begin; out_word < out_word_end; ++out_word)
206  {
207  // fetch a word's worth of input, starting from n
208  const uint32 n_word = n / SYMBOLS_PER_WORD;
209  const uint32 n_mod = n & (SYMBOLS_PER_WORD-1);
210  const uint32 n_syms = SYMBOLS_PER_WORD - n_mod;
211 
212  // fetch the last 'n_syms' symbols of the first word
213  word_type in_word = in_words[n_word] << (n_mod * SYMBOL_SIZE);
214 
215  if (n_syms < SYMBOLS_PER_WORD)
216  {
217  // fetch the remaining symbols from the next word (deleting the first 'n_syms)
218  in_word |= (in_words[n_word+1] >> (n_syms * SYMBOL_SIZE));
219  }
220 
221  // check whether we need to save the occurrence counters
222  save_occurrences<SYMBOL_COUNT>( out_word * SYMBOLS_PER_WORD, occ_intv_log, occ_intv, partials, occ );
223 
224  if (SYMBOL_SIZE == 2)
225  {
226  const uint32 cnts = popc_2bit_all( in_word, count_table );
227 
228  partials[0] += (cnts >> 0) & 0xFF;
229  partials[1] += (cnts >> 8) & 0xFF;
230  partials[2] += (cnts >> 16) & 0xFF;
231  partials[3] += (cnts >> 24) & 0xFF;
232  }
233  else
234  {
235  // loop through the symbols one by one
236  for (uint32 i = 0; i < SYMBOLS_PER_WORD; ++i)
237  {
238  const uint8 c = (in_word >> (WORD_SIZE - SYMBOL_SIZE - i * SYMBOL_SIZE)) & (SYMBOL_COUNT-1);
239  ++partials[c];
240  }
241  // TODO: use a generalized count-table
242  }
243 
244  // ...and write it out
245  out_words[ out_word ] = in_word;
246 
247  // go forward
248  k += SYMBOLS_PER_WORD;
249  n += SYMBOLS_PER_WORD;
250  }
251 
252  // finish copying leftovers
253  while (k < k_end)
254  {
255  // check whether we need to save the occurrence counters
256  save_occurrences<SYMBOL_COUNT>( k, occ_intv_log, occ_intv, partials, occ );
257 
258  const uint8 c = in[n++ - n_start];
259 
260  out[k++ - k_start] = c;
261 
262  ++partials[c];
263  }
264 }
265 
266 // constructor
267 //
268 template <uint32 SYMBOL_SIZE_T, bool BIG_ENDIAN_T>
270  const uint32 page_size,
271  const uint32 segment_size,
272  const uint32 occ_intv) :
273  m_page_size( page_size / sizeof(word_type) ),
274  m_segment_size( segment_size / sizeof(word_type) ),
275  m_occ_intv( occ_intv ),
276  m_occ_intv_w( occ_intv / SYMBOLS_PER_WORD ),
277  m_occ_intv_log( nvbio::log2( occ_intv ) ),
278  m_page_count( 0 ),
279  m_pool_size( 0 )
280 {
281  omp_init_lock( &m_lock );
282 
284 }
285 
286 // destructor
287 //
288 template <uint32 SYMBOL_SIZE_T, bool BIG_ENDIAN_T>
290 {
291  for (uint32 i = 0; i < m_segments.size(); ++i)
292  free( m_segments[i] );
293 }
294 
295 // alloc new pages
296 //
297 template <uint32 SYMBOL_SIZE_T, bool BIG_ENDIAN_T>
299 {
300  const uint32 occ_freq = (sizeof(word_type) / sizeof(uint32)) *
301  (m_occ_intv / SYMBOLS_PER_WORD) / SYMBOL_COUNT;
302 
303  const uint32 n_pages = m_segment_size / m_page_size;
304  const uint32 ext_page_size = m_page_size + m_page_size / occ_freq;
305  const uint32 ext_segment_size = n_pages * ext_page_size;
306 
307  word_type* segment = (word_type*)malloc( ext_segment_size * sizeof(word_type) );
308 
309  if (segment == NULL)
310  {
311  log_error(stderr, "PagedText: failed allocating segment\n");
312  //throw bad_alloc( "PagedText: failed allocating segment\n" );
313  exit(1);
314  }
315  else
316  {
317  // the free page pool has to be able to accomodate all the allocated pages
318  if (m_pool.size() < m_page_count + n_pages)
319  m_pool.resize( m_page_count + n_pages );
320 
321  m_segments.push_back( segment );
322 
323  for (uint32 i = 0; i < n_pages; ++i)
324  m_pool[ m_pool_size++ ] = segment + ext_page_size * (n_pages - i - 1u);
325 
326  m_page_count += n_pages;
327  }
328 }
329 
330 // alloc a new page
331 //
332 template <uint32 SYMBOL_SIZE_T, bool BIG_ENDIAN_T>
335 {
336  omp_set_lock( &m_lock );
337 
338  if (m_pool_size == 0)
339  {
340  omp_unset_lock( &m_lock );
341 
342  log_error(stderr, "PagedText: exhausted page pool\n");
343  //throw bad_alloc( "PagedText: exhausted page pool\n" );
344  exit(1);
345  }
346 
347  word_type* page = m_pool[ --m_pool_size ];
348  assert( page != NULL );
349 
350  omp_unset_lock( &m_lock );
351  return page;
352 }
353 
354 // release a page
355 //
356 template <uint32 SYMBOL_SIZE_T, bool BIG_ENDIAN_T>
358 {
359  assert( page != NULL );
360  omp_set_lock( &m_lock );
361 
362  if (m_pool_size >= m_page_count)
363  {
364  log_error(stderr, "exceeded pool size %u - released more pages than have been allocated\n", m_page_count);
365  exit(1);
366  }
367 
368  m_pool[ m_pool_size++ ] = page;
369 
370  omp_unset_lock( &m_lock );
371 }
372 
373 // indexing operator - return the i-th symbol
374 //
375 template <uint32 SYMBOL_SIZE_T, bool BIG_ENDIAN_T>
377 {
378  // fetch the page containing index 'i'
379  const uint32 page_idx = find_page( i );
380  const word_type* page = get_page( page_idx );
381 
382  const uint32 local_i = uint32( i - m_offsets[ page_idx ] );
383  assert( local_i < m_page_size * SYMBOLS_PER_WORD );
384 
385  const const_packed_page_type packed_page( page );
386  return packed_page[ local_i ];
387 }
388 
389 // compute the rank of c in [0,i]
390 //
391 template <uint32 SYMBOL_SIZE_T, bool BIG_ENDIAN_T>
393 {
394  if (i == uint64(-1))
395  return 0u;
396 
397  if (i >= size())
398  return symbol_frequency(c);
399 
400  // fetch the page containing index 'i'
401  const uint32 page_idx = find_page( i );
402 
403  return rank( page_idx, i, c );
404 }
405 
406 // compute the rank of c in [0,i]
407 //
408 template <uint32 SYMBOL_SIZE_T, bool BIG_ENDIAN_T>
410 {
411  if (i == uint64(-1))
412  return 0u;
413 
414  if (i >= size())
415  return symbol_frequency(c);
416 
417  // fetch the page containing index 'i'
418  const word_type* page = get_page( page_idx );
419  const uint32* occ = (uint32*)( page + m_page_size );
420 
421  const uint32 local_i = uint32( i - m_offsets[ page_idx ] );
422  assert( local_i < m_page_size * SYMBOLS_PER_WORD );
423 
424  // compute the index of the occurrence block containing 'i', and the offset within it
425  const uint32 block_idx = local_i >> m_occ_intv_log;
426  const uint32 block_offset = local_i & (m_occ_intv-1);
427 
428  // fetch the base occurrence counters for the page and block
429  uint64 out =
430  m_counters[ SYMBOL_COUNT * page_idx + c ] +
431  occ[ SYMBOL_COUNT * block_idx + c ];
432 
433  // compute the index of the word containing 'i', and the corresponding modulo
434  const uint32 word_idx = block_offset / SYMBOLS_PER_WORD;
435  const uint32 word_mod = ~word_type(local_i) & (SYMBOLS_PER_WORD-1);
436 
437  const uint32 word_begin = block_idx*m_occ_intv_w;
438 
439  return out + popc_2bit( page + word_begin, word_idx, word_mod, c );
440 }
441 
442 // reserve
443 //
444 template <uint32 SYMBOL_SIZE_T, bool BIG_ENDIAN_T>
446 {
447  // alloc the pages we need
448  m_pool.reserve( n_pages );
449  while (m_page_count < n_pages)
450  grow();
451 
452  m_pages.reserve( n_pages );
453  m_new_pages.reserve( n_pages );
454  m_counters.reserve( (n_pages+1) * SYMBOL_COUNT );
455  m_new_counters.reserve( (n_pages+1) * SYMBOL_COUNT );
456  m_offsets.reserve( n_pages + 1 );
457  m_new_offsets.reserve( n_pages + 1 );
458 }
459 
460 // reserve free pages
461 //
462 template <uint32 SYMBOL_SIZE_T, bool BIG_ENDIAN_T>
464 {
465  // alloc the pages we need
466  m_pool.reserve( n_pages );
467  while (m_pool_size < n_pages)
468  grow();
469 }
470 
471 // reserve
472 //
473 template <uint32 SYMBOL_SIZE_T, bool BIG_ENDIAN_T>
475 {
476  reserve_pages( (uint32)util::divide_ri( n, (m_page_size * SYMBOLS_PER_WORD * 2)/3 ) );
477 }
478 
479 // needed host memory
480 //
481 template <uint32 SYMBOL_SIZE_T, bool BIG_ENDIAN_T>
483 {
484  const uint32 occ_freq = (sizeof(word_type) / sizeof(uint32)) *
485  (m_occ_intv / SYMBOLS_PER_WORD) / SYMBOL_COUNT;
486 
487  const uint32 ext_page_size = m_page_size + m_page_size / occ_freq;
488 
489  const uint64 n_pages = util::divide_ri( n, (m_page_size * SYMBOLS_PER_WORD * 2)/3 );
490  return n_pages * ext_page_size * sizeof(word_type);
491 }
492 
493 // resize and copy a given vector
494 //
495 template <uint32 SYMBOL_SIZE_T, bool BIG_ENDIAN_T>
497 {
498  const uint32 PAGE_SYMBOLS = m_page_size * SYMBOLS_PER_WORD;
499 
500  // alloc the given number of pages
501  const uint32 n_pages = util::divide_ri( n, PAGE_SYMBOLS );
502 
503  reserve_pages( n_pages );
504 
505  // alloc the pages we need
506  m_pages.resize( n_pages );
507  for (uint32 i = 0; i < n_pages; ++i)
508  m_pages[i] = alloc_page();
509 
510  // setup the page offsets
511  m_offsets.resize( n_pages + 1 );
512 
513  #pragma omp parallel for
514  for (int32 i = 0; i < int32(n_pages); ++i)
515  m_offsets[i] = uint64(i) * PAGE_SYMBOLS;
516 
517  m_offsets[ n_pages ] = n;
518 
519  // setup the symbol counters
520  m_counters.resize( (n_pages+1) * SYMBOL_COUNT, uint64(0) );
521 
522  if (c != NULL)
523  {
524  #pragma omp parallel for
525  for (int32 i = 0; i < int32(n_pages); ++i)
526  {
527  const uint64 begin = uint64(i) * PAGE_SYMBOLS;
528  const uint64 end = nvbio::min( n, begin + PAGE_SYMBOLS );
529 
530  // get a new page
531  word_type* page_storage = m_pages[i];
532 
533  packed_page_type page( page_storage );
534 
535  // fill the page contents
536  nvbio::assign( uint32( end - begin ), c + begin, page );
537 
538  // update the occurrence counters
539  uint64* cnts = &m_counters[ i * SYMBOL_COUNT ];
540  uint32* occ = (uint32*)( page_storage + m_page_size );
541 
542  for (uint32 j = 0; j < uint32( end - begin ); ++j)
543  {
544  // check whether we need to the save the occurrence counters
545  if ((j & (m_occ_intv-1)) == 0)
546  {
547  for (uint32 q = 0; q < SYMBOL_COUNT; ++q)
548  occ[q] = cnts[q];
549 
550  occ += SYMBOL_COUNT;
551  }
552 
553  const uint8 cc = c[ begin + j ] & (SYMBOL_COUNT-1);
554  ++cnts[ cc ];
555  }
556  }
557  }
558 
559  // do an exclusive prefix-sum on the occurrence counters
560  nvbio::vector<host_tag,uint8> temp_storage;
561  for (uint32 j = 0; j < SYMBOL_COUNT; ++j)
562  {
564  n_pages+1,
565  make_strided_iterator( &m_counters[j], SYMBOL_COUNT ),
566  make_strided_iterator( &m_counters[j], SYMBOL_COUNT ),
567  thrust::plus<uint64>(),
568  uint64(0),
569  temp_storage );
570  }
571 
572  // do an error check on the occurrence counters
573  const uint64* cnts = symbol_frequencies();
574  uint64 n_occ = 0;
575  for (uint32 j = 0; j < SYMBOL_COUNT; ++j)
576  n_occ += cnts[j];
577 
578  if (n_occ != n)
579  {
580  log_error(stderr, "mismatching occurrence counters: expected %llu symbols, got %llu\n", n, n_occ );
581  //throw runtime_error( "mismatching occurrence counters: expected %llu symbols, got %llu\n", n, n_occ );
582  exit(1);
583  }
584 
585  build_buckets( m_offsets.back(), (uint32)m_offsets.size(), &m_offsets[0], BUCKET_SIZE, m_buckets );
586 }
587 
588 
589 template <uint32 SYMBOL_SIZE, bool BIG_ENDIAN>
591 {
592  static const uint32 SYMBOL_COUNT = 1u << SYMBOL_SIZE;
593 
598 
600  const uint32 _N,
601  const uint32 _in_leaves,
602  const uint32* _leaf_ids,
603  const uint64* _g,
604  const uint8* _c,
605  paged_text_type* _text) :
606  N ( _N ),
607  in_leaves ( _in_leaves ),
608  leaf_ids ( _leaf_ids ),
609  g ( _g ),
610  c ( _c ),
611  text ( _text )
612  {}
613 
616  const uint32 k_out,
617  uint32* partials,
618  uint32* out_leaf,
619  word_type** out_page,
620  packed_page_type* out_stream,
621  uint32** occ) const
622  {
623  // write out the old partials
624  for (uint32 q = 0; q < SYMBOL_COUNT; ++q)
625  text->m_new_counters[ SYMBOL_COUNT * *out_leaf + q ] = partials[q];
626 
627  // do a tiny error check
628  {
629  const uint32 cnt = std::accumulate( partials, partials + SYMBOL_COUNT, 0u );
630 
631  if (cnt != k_out)
632  {
633  log_error(stderr, "alloc_page(%u) : expected %u occurrences, got %u\n", *out_leaf, k_out, cnt);
634  //throw runtime_error( "alloc_page(%u) : expected %u occurrences, got %u\n", *out_leaf, k_out, cnt );
635  exit(1);
636  }
637  }
638 
639  // and reset them
640  for (uint32 q = 0; q < SYMBOL_COUNT; ++q)
641  partials[q] = 0;
642 
643  // alloc a new page
644  (*out_leaf)++;
645  *out_page = text->alloc_page();
646  *out_stream = packed_page_type( *out_page );
647  *occ = (uint32*)( *out_page + text->m_page_size );
648 
649  // write out the new leaf offset
650  text->m_new_offsets[ *out_leaf ] = text->m_new_offsets[ *out_leaf-1 ] + k_out;
651 
652  // save the new page
653  text->m_new_pages[ *out_leaf ] = *out_page;
654  }
655 
656  void operator() (const uint32 in_leaf) const
657  {
658  NVBIO_VAR_UNUSED const uint32 LEAF_SYMBOLS = text->m_page_size * paged_text_type::SYMBOLS_PER_WORD;
659 
660  uint32 out_leaf = leaf_ids[ in_leaf ];
661  const uint32 out_leaf_begin = leaf_ids[ in_leaf ];
662  const uint32 out_leaf_end = leaf_ids[ in_leaf+1u ];
663  const uint64 in_leaf_begin = text->m_offsets[ in_leaf ];
664  const uint64 in_leaf_end = text->m_offsets[ in_leaf + 1u ];
665  const uint32 in_leaf_size = in_leaf_end - in_leaf_begin;
666 
667  const uint32 g_begin = lower_bound_index( in_leaf_begin, g, N );
668  const uint32 g_end = in_leaf < in_leaves-1 ?
669  lower_bound_index( in_leaf_end, g, N ) : N;
670 
671  if (g_begin == g_end)
672  {
673  //
674  // special case: this leaf got no insertions and doesn't need to be copied
675  //
676 
677  word_type* in_page = text->m_pages[ in_leaf ];
678 
679  // write out the new leaf offset
680  text->m_new_offsets[ out_leaf ] = in_leaf_begin + g_begin;
681 
682  // save the new page
683  text->m_new_pages[ out_leaf ] = in_page;
684 
685  // write out the new counters
686  for (uint32 q = 0; q < SYMBOL_COUNT; ++q)
687  {
688  text->m_new_counters[ SYMBOL_COUNT * out_leaf + q ] =
689  text->m_counters[ SYMBOL_COUNT * (in_leaf+1u) + q ] -
690  text->m_counters[ SYMBOL_COUNT * (in_leaf+0u) + q ];
691  }
692  return;
693  }
694 
695  word_type* in_page = text->m_pages[ in_leaf ];
696  word_type* out_page = text->alloc_page();
697  uint32* occ = (uint32*)( out_page + text->m_page_size );
698 
699  const_packed_page_type in_stream( in_page );
700  packed_page_type out_stream( out_page );
701 
702  // compute the maximum number of elements we'll place in each page
703  const uint32 elements_per_page = util::divide_ri( in_leaf_size + g_end - g_begin, out_leaf_end - out_leaf_begin );
704 
705  // write out the new leaf offset
706  text->m_new_offsets[ out_leaf ] = in_leaf_begin + g_begin;
707 
708  // save the new page
709  text->m_new_pages[ out_leaf ] = out_page;
710 
711  uint32 k_in = 0u;
712  uint32 k_out = 0u;
713 
714  uint32 partials[SYMBOL_COUNT];
715  for (uint32 q = 0; q < SYMBOL_COUNT; ++q)
716  partials[q] = 0;
717 
718  for (uint32 j = g_begin; j < g_end; ++j)
719  {
720  // fetch the next insertion
721  const uint32 g_pos = uint32( g[j] - in_leaf_begin );
722  const uint8 cc = c[j] & (SYMBOL_COUNT-1);
723  //assert( g[j] >= in_leaf_begin && g[j] < in_leaf_end );
724 
725  if (g_pos - k_in)
726  {
727  const uint32 m = nvbio::min( g_pos, in_leaf_size ) - k_in;
728  assert( m <= LEAF_SYMBOLS );
729 
730  // the current page can hold only the first 'r' symbols
731  const uint32 r = nvbio::min( elements_per_page - k_out, m );
732 
733  copy( r, in_stream + k_in, out_stream + k_out, text->m_occ_intv_log, text->m_occ_intv, partials, occ, text->m_count_table );
734 
735  k_out += r;
736 
737  // check if we need a new page
738  if (r < m)
739  {
740  alloc_page(
741  k_out,
742  partials,
743  &out_leaf,
744  &out_page,
745  &out_stream,
746  &occ );
747 
748  // copy the remaining symbols
749  copy( m - r, in_stream + k_in + r, out_stream, text->m_occ_intv_log, text->m_occ_intv, partials, occ, text->m_count_table );
750 
751  k_out = m - r;
752  }
753 
754  k_in = g_pos;
755  }
756 
757  if (k_out < elements_per_page)
758  {
759  // save current occurrence counters
760  save_occurrences<SYMBOL_COUNT>( k_out, text->m_occ_intv_log, text->m_occ_intv, partials, occ );
761 
762  out_stream[ k_out++ ] = cc;
763  }
764  else
765  {
766  alloc_page(
767  k_out,
768  partials,
769  &out_leaf,
770  &out_page,
771  &out_stream,
772  &occ );
773 
774  k_out = 0u;
775 
776  // save current occurrence counters
777  save_occurrences<SYMBOL_COUNT>( k_out, text->m_occ_intv_log, text->m_occ_intv, partials, occ );
778 
779  out_stream[ k_out++ ] = cc;
780  }
781  // update partial occurrence counters
782  ++partials[ cc ];
783  }
784 
785  if (in_leaf_size > k_in)
786  {
787  const uint32 m = in_leaf_size - k_in;
788  assert( m <= LEAF_SYMBOLS );
789 
790  // the current page can hold only the first 'r' symbols
791  const uint32 r = nvbio::min( elements_per_page - k_out, m );
792 
793  copy( r, in_stream + k_in, out_stream + k_out, text->m_occ_intv_log, text->m_occ_intv, partials, occ, text->m_count_table );
794 
795  k_out += r;
796 
797  // check if we need a new page
798  if (r < m)
799  {
800  alloc_page(
801  k_out,
802  partials,
803  &out_leaf,
804  &out_page,
805  &out_stream,
806  &occ );
807 
808  // copy the remaining symbols
809  copy( m - r, in_stream + k_in + r, out_stream, text->m_occ_intv_log, text->m_occ_intv, partials, occ, text->m_count_table );
810  }
811  }
812 
813  // write out the final partials
814  for (uint32 q = 0; q < SYMBOL_COUNT; ++q)
815  text->m_new_counters[ SYMBOL_COUNT * out_leaf + q ] = partials[q];
816 
817  // release the input page
818  text->release_page( in_page );
819 
820  if (out_leaf+1 != out_leaf_end)
821  {
822  log_error(stderr, "mismatching number of output leaves: leaf[%u/%u] : expected %u, got %u\n",
823  in_leaf, in_leaves,
824  out_leaf_end - out_leaf_begin,
825  out_leaf - out_leaf_begin);
826  log_error(stderr, " in-size : %u\n", in_leaf_size);
827  log_error(stderr, " insertions : %u\n", uint32( g_end - g_begin ));
828 
829  //throw runtime_error( "mismatching number of output leaves: leaf[%u/%u] : expected %u, got %u\n",
830  // in_leaf, in_leaves,
831  // out_leaf_end - out_leaf_begin,
832  // out_leaf - out_leaf_begin );
833  exit(1);
834  }
835  }
836 
837  const uint32 N;
839  const uint32* leaf_ids;
840  const uint64* g;
841  const uint8* c;
843 };
844 
845 template <uint32 SYMBOL_SIZE, bool BIG_ENDIAN>
847 {
848  static const uint32 SYMBOL_COUNT = 1u << SYMBOL_SIZE;
849 
854 
856  const uint64 _N,
857  const uint32 _out_leaves,
858  const uint32 _in_leaves,
859  paged_text_type* _text) :
860  N ( _N ),
861  out_leaves ( _out_leaves ),
862  in_leaves ( _in_leaves ),
863  text ( _text )
864  {}
865 
866  void operator() (const uint32 out_leaf) const
867  {
868  NVBIO_VAR_UNUSED const uint32 LEAF_SYMBOLS = text->m_page_size * paged_text_type::SYMBOLS_PER_WORD;
869 
870  const uint64 out_leaf_begin = uint64( out_leaf ) * LEAF_SYMBOLS;
871  const uint64 out_leaf_end = nvbio::min( uint64( out_leaf + 1u ) * LEAF_SYMBOLS, N );
872  const uint32 out_leaf_size = uint32( out_leaf_end - out_leaf_begin );
873 
874  // alloc a new page
875  word_type* out_page = text->alloc_page();
876  packed_page_type out_stream( out_page );
877  uint32* occ = (uint32*)( out_page + text->m_page_size );
878 
879  // write out the new leaf offset
880  text->m_new_offsets[ out_leaf ] = out_leaf_begin;
881 
882  // save the new page
883  text->m_new_pages[ out_leaf ] = out_page;
884 
885  uint32 partials[SYMBOL_COUNT];
886  for (uint32 q = 0; q < SYMBOL_COUNT; ++q)
887  partials[q] = 0;
888 
889  uint32 k_out = 0;
890 
891  uint32 in_leaf = upper_bound_index( out_leaf_begin, raw_pointer( text->m_offsets ), in_leaves+1 ) - 1u;
892 
893  for (; k_out < out_leaf_size && in_leaf < in_leaves; ++in_leaf)
894  {
895  const uint64 in_leaf_begin = text->m_offsets[ in_leaf ];
896  const uint64 in_leaf_end = text->m_offsets[ in_leaf+1 ];
897  const uint32 in_leaf_size = uint32( in_leaf_end - in_leaf_begin );
898 
899  const word_type* in_page = text->m_pages[ in_leaf ];
900  const const_packed_page_type in_stream( in_page );
901  assert( in_page != NULL );
902 
903  const uint32 k_in = in_leaf_begin >= out_leaf_begin ? 0u : uint32( out_leaf_begin - in_leaf_begin );
904  const uint32 r = nvbio::min( out_leaf_size - k_out, in_leaf_size - k_in );
905 
906  copy( r, in_stream + k_in, out_stream + k_out, text->m_occ_intv_log, text->m_occ_intv, partials, occ, text->m_count_table );
907 
908  k_out += r;
909  }
910 
911  // write out the final partials
912  for (uint32 q = 0; q < SYMBOL_COUNT; ++q)
913  text->m_new_counters[ SYMBOL_COUNT * out_leaf + q ] = partials[q];
914 
915  const uint32 cnt = std::accumulate( partials, partials + SYMBOL_COUNT, 0u );
916  if (cnt != k_out)
917  {
918  log_error(stderr, "merge_pages(%u) : expected %u occurrences, got %u\n", out_leaf, k_out, cnt);
919  //throw runtime_error( "merge_pages(%u) : expected %u occurrences, got %u\n", out_leaf, k_out, cnt );
920  exit(1);
921  }
922  }
923 
924  const uint64 N;
928 };
929 
930 // perform a batch of parallel insertions
931 //
932 template <uint32 SYMBOL_SIZE_T, bool BIG_ENDIAN_T>
933 inline void PagedText<SYMBOL_SIZE_T,BIG_ENDIAN_T>::insert(const uint32 n, const uint64* g, const uint8* c)
934 {
935  const uint32 LEAF_SYMBOLS = m_page_size * SYMBOLS_PER_WORD;
936 
937  const uint32 n_leaves = m_offsets.size() - 1u;
938 
939  Timer timer;
940  timer.start();
941 
942  nvbio::vector<host_tag,uint32> leaf_sizes( n_leaves + 1u );
943  nvbio::vector<host_tag,uint32> new_leaf_ids( n_leaves + 1u );
944  nvbio::vector<host_tag,uint8> temp_storage;
945 
946  timer.stop();
947  NVBIO_VAR_UNUSED const float t0 = timer.seconds();
948 
949  timer.start();
950 
951  // compute the original leaf sizes
952  thrust::adjacent_difference(
953  m_offsets.begin(),
954  m_offsets.begin() + n_leaves + 1u,
955  leaf_sizes.begin() );
956 
957  // extend the last leaf to infinity
958  const uint64 old_size = m_offsets.back();
959  m_offsets.back() = uint64(-1);
960 
961  if (n < n_leaves)
962  {
963  nvbio::vector<host_tag,uint32> g_leaves( n );
964  nvbio::vector<host_tag,uint32> ins_leaves( n );
965  nvbio::vector<host_tag,uint32> ins_counts( n );
966 
967  // find how many elements of g fall in each leaf
968  nvbio::upper_bound<host_tag>(
969  n,
970  g,
971  n_leaves + 1u,
972  m_offsets.begin(),
973  g_leaves.begin() );
974 
975  // compute which leaves need splitting, and how much
976  const uint32 n_touched = nvbio::runlength_encode(
977  n,
978  g_leaves.begin(),
979  ins_leaves.begin(),
980  ins_counts.begin(),
981  temp_storage );
982 
983  log_debug(stderr, " touched leaves %u, (%.2f%% - %.1fMB)\n", n_touched, 100.0f * float(n_touched) / float(n_leaves), float(n_touched)*(m_page_size*sizeof(word_type)) / float(1024*1024));
984 
985  // for each leaf l = h_ins_leaves[i], do h_leaf_sizes[l] += h_ins_counts[i]
986  nvbio::transform<host_tag>(
987  n_touched,
988  thrust::make_permutation_iterator( leaf_sizes.begin(), ins_leaves.begin() ),
989  ins_counts.begin(),
990  thrust::make_permutation_iterator( leaf_sizes.begin(), ins_leaves.begin() ),
991  thrust::plus<uint32>() );
992  }
993  else
994  {
995  nvbio::vector<host_tag,uint32> g_leaves( n_leaves + 1u );
996  nvbio::vector<host_tag,uint32> ins_counts( n_leaves + 1u );
997 
998  // for each leaf, find how many elements of g fall inside it
999  nvbio::lower_bound<host_tag>(
1000  n_leaves + 1u,
1001  m_offsets.begin(),
1002  n,
1003  g,
1004  g_leaves.begin() );
1005 
1006  // make sure that the last leaf includes all elements in g greater than the current size
1007  g_leaves[ n_leaves ] = n;
1008 
1009  // compute the number of insertions in each leaf
1010  thrust::adjacent_difference(
1011  g_leaves.begin(),
1012  g_leaves.begin() + n_leaves + 1u,
1013  ins_counts.begin() );
1014 
1015  // for each leaf do h_leaf_sizes[i] += h_ins_counts[i]
1016  nvbio::transform<host_tag>(
1017  n_leaves + 1u,
1018  leaf_sizes.begin(),
1019  ins_counts.begin(),
1020  leaf_sizes.begin(),
1021  thrust::plus<uint32>() );
1022 
1023  /*
1024  const uint32 n_split = nvbio::reduce(
1025  n_leaves,
1026  make_cast_iterator<uint32>(
1027  thrust::make_transform_iterator(
1028  thrust::make_transform_iterator( leaf_sizes.begin() + 1u, divide_ri_functor<uint32>( LEAF_SYMBOLS ) ),
1029  not_equal_to_functor<uint32>(1u) ) ),
1030  thrust::plus<uint32>(),
1031  temp_storage );
1032  log_debug(stderr, " split leaves %u (%.2f%%)\n", n_split, 100.0f * float(n_split) / float(n_leaves));
1033 
1034  const uint32 n_touched = nvbio::reduce(
1035  n_leaves,
1036  make_cast_iterator<uint32>(
1037  thrust::make_transform_iterator( ins_counts.begin() + 1u, not_equal_to_functor<uint32>(0u) ) ),
1038  thrust::plus<uint32>(),
1039  temp_storage );
1040  log_debug(stderr, " touched leaves %u (%.2f%% - %.1fMB)\n", n_touched, 100.0f * float(n_touched) / float(n_leaves), float(n_touched)*(m_page_size*sizeof(word_type)) / float(1024*1024));
1041  */
1042  }
1043 
1044  // reset the end of the last leaf
1045  m_offsets.back() = old_size;
1046 
1047  //
1048  // at this point, each old leaf will be split in util::divide_ri( h_leaf_sizes[i], LEAF_SYMBOLS )
1049  // new leaves
1050  //
1051 
1052  // do a prefix sum to compute the new leaf numbering
1054  n_leaves + 1u,
1055  thrust::make_transform_iterator( leaf_sizes.begin(), divide_ri_functor<uint32>( LEAF_SYMBOLS ) ),
1056  new_leaf_ids.begin(),
1057  thrust::plus<uint32>(),
1058  temp_storage );
1059 
1060  const uint32 out_leaves = new_leaf_ids[ n_leaves ];
1061 
1062  // alloc a new set of page pointers and offsets
1063  m_new_pages.resize( out_leaves );
1064  m_new_offsets.resize( out_leaves+1 );
1065  m_new_counters.resize( (out_leaves+1) * SYMBOL_COUNT, uint64(0) );
1066 
1067  timer.stop();
1068  NVBIO_VAR_UNUSED const float t1 = timer.seconds();
1069 
1070  timer.start();
1071 
1072  const uint32 BATCH_SIZE = 4*1024;
1073 
1074  reserve_pages( out_leaves + nvbio::min( n_leaves, BATCH_SIZE ) );
1075 
1076  timer.stop();
1077  NVBIO_VAR_UNUSED const float t2 = timer.seconds();
1078 
1079  const float utilization = (float( size() + n ) / float(LEAF_SYMBOLS)) / float( out_leaves );
1080 
1081  log_debug(stderr, " copy pages %u -> %u (utilization : %.1f%%)\n",
1082  n_leaves, out_leaves,
1083  100.0f * utilization );
1084 
1085  timer.start();
1086 
1087  const copy_insert_pages<SYMBOL_SIZE,BIG_ENDIAN> copy_functor(
1088  n,
1089  n_leaves,
1090  nvbio::raw_pointer( new_leaf_ids ),
1091  g,
1092  c,
1093  this );
1094 
1095  for (uint32 batch_begin = 0; batch_begin < n_leaves; batch_begin += BATCH_SIZE)
1096  {
1097  const uint32 batch_end = nvbio::min( n_leaves, batch_begin + BATCH_SIZE );
1098 
1099  //log_verbose(stderr, " block[%u:%u] (pool: %u)\n", batch_begin, batch_end, m_pool_size);
1100 
1101  // fill the new leaves (one thread per input leaf)
1102  nvbio::for_each<host_tag>(
1103  batch_end - batch_begin,
1104  thrust::make_counting_iterator<uint32>( batch_begin ),
1105  copy_functor );
1106  }
1107 
1108  timer.stop();
1109  NVBIO_VAR_UNUSED const float t3 = timer.seconds();
1110  //log_verbose(stderr, " %.2f insertions/s (%.1f%%, %.1f%%, %.1f%%, %.1f%%)\n",
1111  // float(n) / (t0 + t1 + t2 + t3),
1112  // 100.0f * t0 / (t0 + t1 + t2 + t3),
1113  // 100.0f * t1 / (t0 + t1 + t2 + t3),
1114  // 100.0f * t2 / (t0 + t1 + t2 + t3),
1115  // 100.0f * t3 / (t0 + t1 + t2 + t3));
1116 
1117  // write the sentinel offset
1118  m_new_offsets[ out_leaves ] = m_offsets[ n_leaves ] + n;
1119 
1120  // swap-in the new pages
1121  m_pages.swap( m_new_pages );
1122  m_offsets.swap( m_new_offsets );
1123  m_counters.swap( m_new_counters );
1124 
1125  // do an exclusive prefix-sum on the occurrence counters
1126  for (uint32 j = 0; j < SYMBOL_COUNT; ++j)
1127  {
1129  out_leaves+1,
1130  make_strided_iterator( &m_counters[j], SYMBOL_COUNT ),
1131  make_strided_iterator( &m_counters[j], SYMBOL_COUNT ),
1132  thrust::plus<uint64>(),
1133  uint64(0),
1134  temp_storage );
1135  }
1136 
1137  // do an error check on the occurrence counters
1138  const uint64* cnts = symbol_frequencies();
1139  uint64 n_occ = 0;
1140  for (uint32 j = 0; j < SYMBOL_COUNT; ++j)
1141  n_occ += cnts[j];
1142 
1143  if (n_occ != m_offsets[ out_leaves ])
1144  {
1145  log_error(stderr, "mismatching occurrence counters: expected %llu symbols, got %llu\n", m_offsets[ out_leaves ], n_occ );
1146  //throw runtime_error( "mismatching occurrence counters: expected %llu symbols, got %llu\n", m_offsets[ out_leaves ], n_occ );
1147  exit(1);
1148  }
1149 
1150  //if (utilization < 0.75f)
1151  // defrag();
1152 
1153  build_buckets( m_offsets.back(), (uint32)m_offsets.size(), &m_offsets[0], BUCKET_SIZE, m_buckets );
1154 }
1155 
1156 // global symbol frequencies
1157 //
1158 template <uint32 SYMBOL_SIZE_T, bool BIG_ENDIAN_T>
1160 {
1161  const uint32 LEAF_SYMBOLS = m_page_size * SYMBOLS_PER_WORD;
1162 
1163  const uint32 in_leaves = page_count();
1164  const uint64 n_symbols = size();
1165  const uint32 out_leaves = util::divide_ri( n_symbols, LEAF_SYMBOLS );
1166 
1167  log_debug(stderr, " defrag %u -> %u\n", in_leaves, out_leaves );
1168 
1169  nvbio::vector<host_tag,uint8> temp_storage;
1170 
1171  // alloc a new set of page pointers and offsets
1172  m_new_pages.resize( out_leaves );
1173  m_new_offsets.resize( out_leaves+1 );
1174  m_new_counters.resize( (out_leaves+1) * SYMBOL_COUNT, uint64(0) );
1175 
1176  const uint32 BATCH_SIZE = 4*1024;
1177 
1178  uint32 in_leaf_begin = 0;
1179 
1180  const copy_merge_pages<SYMBOL_SIZE,BIG_ENDIAN> merge_functor(
1181  n_symbols,
1182  out_leaves,
1183  in_leaves,
1184  this );
1185 
1186  // loop across output pages
1187  for (uint32 batch_begin = 0; batch_begin < out_leaves; batch_begin += BATCH_SIZE)
1188  {
1189  const uint32 batch_end = nvbio::min( out_leaves, batch_begin + BATCH_SIZE );
1190 
1191  // make sure we have enough free pages
1192  reserve_free_pages( batch_end - batch_begin );
1193 
1194  // fill the new leaves (one thread per input leaf)
1195  nvbio::for_each<host_tag>(
1196  batch_end - batch_begin,
1197  thrust::make_counting_iterator<uint32>( batch_begin ),
1198  merge_functor );
1199 
1200  // release input pages that will no longer be needed
1201  const uint32 in_leaf_end = upper_bound_index( uint64( batch_end ) * LEAF_SYMBOLS, raw_pointer( m_offsets ), in_leaves+1 ) - 1u;
1202 
1203  for (uint32 i = in_leaf_begin; i < in_leaf_end; ++i)
1204  {
1205  release_page( m_pages[i] );
1206  m_pages[i] = NULL;
1207  }
1208 
1209  in_leaf_begin = in_leaf_end;
1210  }
1211 
1212  // release any not yet released input pages
1213  for (uint32 i = in_leaf_begin; i < in_leaves; ++i)
1214  {
1215  release_page( m_pages[i] );
1216  m_pages[i] = NULL;
1217  }
1218 
1219  // write the sentinel offset
1220  m_new_offsets[ out_leaves ] = n_symbols;
1221 
1222  // swap-in the new pages
1223  m_pages.swap( m_new_pages );
1224  m_offsets.swap( m_new_offsets );
1225  m_counters.swap( m_new_counters );
1226 
1227  for (uint32 i = 0; i < page_count()-1; ++i)
1228  {
1229  const uint32 cnt = std::accumulate(
1230  &m_counters[i*SYMBOL_COUNT],
1231  &m_counters[i*SYMBOL_COUNT] + SYMBOL_COUNT, 0u );
1232 
1233  if (cnt != LEAF_SYMBOLS)
1234  log_error(stderr, "mismatching occurrence counters: at page[%u], expected %llu symbols, got %llu\n", i, LEAF_SYMBOLS, cnt );
1235  }
1236 
1237  // do an exclusive prefix-sum on the occurrence counters
1238  for (uint32 j = 0; j < SYMBOL_COUNT; ++j)
1239  {
1241  out_leaves+1,
1242  make_strided_iterator( &m_counters[j], SYMBOL_COUNT ),
1243  make_strided_iterator( &m_counters[j], SYMBOL_COUNT ),
1244  thrust::plus<uint64>(),
1245  uint64(0),
1246  temp_storage );
1247  }
1248 
1249  // do an error check on the occurrence counters
1250  const uint64* cnts = symbol_frequencies();
1251  uint64 n_occ = 0;
1252  for (uint32 j = 0; j < SYMBOL_COUNT; ++j)
1253  n_occ += cnts[j];
1254 
1255  if (n_occ != n_symbols)
1256  {
1257  log_error(stderr, "mismatching occurrence counters: expected %llu symbols, got %llu\n", n_symbols, n_occ );
1258  //throw runtime_error( "mismatching occurrence counters: expected %llu symbols, got %llu\n", n_symbols, n_occ );
1259  exit(1);
1260  }
1261 }
1262 
1263 // global symbol frequencies
1264 //
1265 template <uint32 SYMBOL_SIZE_T, bool BIG_ENDIAN_T>
1267 {
1268  return &m_counters[ page_count() * SYMBOL_COUNT ];
1269 }
1270 
1271 // find the page containing the i-th entry
1272 //
1273 template <uint32 SYMBOL_SIZE_T, bool BIG_ENDIAN_T>
1275 {
1276  const uint32 b = i >> LOG_BUCKET_SIZE;
1277  const uint32 lo = m_buckets[b];
1278  const uint32 hi = m_buckets[b+1];
1279  return upper_bound_index( i, &m_offsets[lo], hi - lo ) + lo - 1u;
1280  //return upper_bound_index( i, &m_offsets[0], (uint32)m_offsets.size() ) - 1u;
1281 }
1282 
1283 } // namespace nvbio