NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
string_set_inl.h
Go to the documentation of this file.
1 /*
2  * nvbio
3  * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #pragma once
29 
30 #include <nvbio/basic/algorithms.h>
31 #include <nvbio/basic/exceptions.h>
32 
33 #if defined(__CUDACC__)
34 
35 #include <nvbio/basic/cuda/arch.h>
36 #include <thrust/device_vector.h>
37 #include <thrust/scan.h>
38 #if THRUST_VERSION >= 100700
39 #include <thrust/execution_policy.h>
40 #endif
41 
42 #endif // defined(__CUDACC__)
43 
44 namespace nvbio {
45 
46 #if defined(__CUDACC__)
47 
48 namespace cuda {
49 
50 //
51 // A kernel to extract string lengths from a generic string set
52 //
53 template <
55  typename Iterator,
56  typename T>
57 __global__
58 void vector_init_kernel(
59  const uint32 N,
60  Iterator out,
61  T value)
62 {
63  const uint32 tid = threadIdx.x + blockIdx.x * BLOCKDIM;
64  if (tid < N)
65  out[tid] = value;
66 }
67 
68 //
69 // A kernel to extract string lengths from a generic string set
70 //
71 template <
73  typename InStringSet,
74  typename OutLengthIterator>
75 __global__
76 void generic_string_lengths_kernel(
77  const uint32 N_strings,
78  const InStringSet in_set,
79  OutLengthIterator out_lengths)
80 {
81  const uint32 tid = threadIdx.x + blockIdx.x * BLOCKDIM;
82  if (tid > N_strings) // allow to fill one more entry than N_strings
83  return;
84 
85  out_lengths[ tid ] = tid < N_strings ? in_set[tid].size() : 0u;
86 }
87 
88 //
89 // A kernel to transform a generic string set into a concatenated set
90 //
91 template <
93  typename InStringSet,
94  typename OutStringIterator,
95  typename OutOffsetIterator>
96 __global__
97 void generic_to_concat_kernel(
98  const uint32 N_strings,
99  InStringSet in_set,
100  OutStringIterator out_string,
101  OutOffsetIterator out_offsets)
102 {
103  const uint32 tid = threadIdx.x + blockIdx.x * BLOCKDIM;
104  if (tid >= N_strings)
105  return;
106 
107  typename InStringSet::string_type in_string = in_set[tid];
108 
109  const uint32 length = in_string.size();
110 
111  const uint32 offset = out_offsets[tid];
112  for (uint32 j = 0; j < length; ++j)
113  out_string[offset + j] = in_string[j];
114 }
115 
116 //
117 // A kernel to transform a sparse/concatenated string set into a concatenated set
118 // Use one warp per string.
119 //
120 template <
122  typename InStringSet,
123  typename OutStringIterator,
124  typename OutOffsetIterator>
125 __global__
126 void contig_to_concat_kernel(
127  const uint32 N_strings,
128  InStringSet in_set,
129  OutStringIterator out_string,
130  OutOffsetIterator out_offsets)
131 {
133  const uint32 wid = warp_id() + blockIdx.x * NUM_WARPS;
134  if (wid >= N_strings)
135  return;
136 
137  typename InStringSet::string_type in_string = in_set[wid];
138 
139  const uint32 length = in_string.size();
140 
141  const uint32 offset = out_offsets[wid];
142 
143  for (uint32 j = warp_tid(); j < length; j += cuda::Arch::WARP_SIZE)
144  out_string[offset + j] = in_string[j];
145 }
146 
147 //
148 // A kernel to read a bunch of values from a generic iterator and write them to another generic iterator
149 //
150 template <
152  typename InputIterator,
153  typename OutputIterator>
154 __global__
155 void generic_vector_copy_kernel(
156  const uint32 N,
157  InputIterator input,
158  OutputIterator output)
159 {
160  const uint32 tid = threadIdx.x + blockIdx.x * BLOCKDIM;
161  if (tid < N)
162  output[tid] = input[tid];
163 }
164 
165 //
166 // A kernel to transform a generic string set into a packed concatenated set.
167 // This specialization is needed because writing packed concatenated strings
168 // in parallel is not safe, as some words might be spanned by multiple strings.
169 //
170 template <
172  uint32 SYMBOL_SIZE,
173  bool BIG_ENDIAN,
174  typename InStringSet,
175  typename OutStreamIterator,
176  typename OutOffsetIterator>
177 __global__
178 void generic_to_packed_concat_kernel(
179  const uint32 N_strings,
180  const uint32 N_words,
181  InStringSet in_set,
182  OutStreamIterator out_stream,
183  OutOffsetIterator out_offsets)
184 {
185  const uint32 WARP_SIZE = cuda::Arch::WARP_SIZE;
186  const uint32 WORDS_PER_THREAD = 4u;
187 
188  const uint32 tid = threadIdx.x + blockIdx.x * BLOCKDIM;
189  if (tid * WORDS_PER_THREAD >= N_words)
190  return;
191 
192  //
193  // each thread is responsible to write out WARP_SIZE words
194  //
195  const uint32 SYMBOLS_PER_WORD = (8u*sizeof(uint32)) / SYMBOL_SIZE;
196 
197  // loop through the symbols in the given word and find out which string
198  // they belong to in the input set
199 
200  uint32 global_symbol = SYMBOLS_PER_WORD * WORDS_PER_THREAD * tid;
201  uint32 string_id = uint32( upper_bound( global_symbol, out_offsets, N_strings ) - out_offsets ) - 1u;
202  uint32 local_symbol = global_symbol - out_offsets[string_id];
203 
204  // fetch the relative input string
205  typename InStringSet::string_type in_string = in_set[string_id];
206 
207  const uint32 wid = warp_id();
208  const uint32 wtid = warp_tid();
209 
210  __shared__ volatile uint32 sm[ BLOCKDIM * WORDS_PER_THREAD ];
211  volatile uint32* warp_sm = sm + wid * WARP_SIZE * WORDS_PER_THREAD;
212 
213  for (uint32 w = 0; w < WORDS_PER_THREAD && string_id < N_strings; ++w)
214  {
215  uint32 word = 0u;
216 
217  #pragma unroll
218  for (uint32 s = 0; s < SYMBOLS_PER_WORD; ++s, ++global_symbol, ++local_symbol)
219  {
220  // compute the local position of the symbol in the input string
221  while (local_symbol >= in_string.size())
222  {
223  in_string = in_set[++string_id];
224  local_symbol = 0;
225  }
226  if (string_id == N_strings)
227  break;
228 
229  // fetch the relative character
230  const uint8 in_c = in_string[ local_symbol ];
231 
232  // and write it into the word
233  word |= (in_c << (s*SYMBOL_SIZE)); // TODO: consider endianness here
234  }
235 
236  // write out the packed word
237  warp_sm[ wtid*WORDS_PER_THREAD + w ] = word;
238  }
239 
240  // and now write the words from each thread in parallel
241  const uint32 base_offset = blockIdx.x*BLOCKDIM*WORDS_PER_THREAD +
242  WORDS_PER_THREAD * WARP_SIZE * wid;
243 
244  for (uint32 t = 0; t < WARP_SIZE; ++t)
245  {
246  if (wtid < WORDS_PER_THREAD)
247  out_stream[ base_offset + t*WORDS_PER_THREAD + wtid ] =
248  warp_sm[ t*WORDS_PER_THREAD + wtid ];
249  }
250 }
251 
252 //
253 // A kernel to transform a concatenated string set into a packed concatenated set.
254 //
255 template <
257  uint32 SYMBOL_SIZE,
258  bool BIG_ENDIAN,
259  typename InStringSet,
260  typename OutStreamIterator,
261  typename OutOffsetIterator>
262 __global__
263 void concat_to_packed_concat_kernel(
264  const uint32 N_strings,
265  const uint32 N_symbols,
266  InStringSet in_set,
267  OutStreamIterator out_stream,
268  OutOffsetIterator out_offsets)
269 {
270  const uint32 tid = threadIdx.x + blockIdx.x * BLOCKDIM;
271 
272  //
273  // each thread is responsible to read in 1 word worth of symbols
274  //
275 
276  // copy the offsets
277  if (tid <= N_strings)
278  out_offsets[tid] = in_set.offsets()[tid];
279 
280  const uint32 SYMBOLS_PER_WORD = (8u*sizeof(uint32)) / SYMBOL_SIZE;
281  if (tid * SYMBOLS_PER_WORD >= N_symbols)
282  return;
283 
284  // fetch the input symbol string
285  typename InStringSet::symbol_iterator in_symbols = in_set.base_string();
286 
287  uint32 word = 0;
288  #pragma unroll
289  for (uint32 s = 0, in_s = tid * SYMBOLS_PER_WORD; s < SYMBOLS_PER_WORD; ++s, ++in_s)
290  {
291  const uint8 in_c = in_s < N_symbols ? in_symbols[ in_s ] : 0u;
292  word |= in_c << (s*SYMBOL_SIZE); // TODO: handle Endianness here
293  }
294 
295  // write the packed word out
296  out_stream[ tid ] = word;
297 }
298 
299 //
300 // A kernel to transform a generic string set into a strided set
301 //
302 template <
304  typename InStringSet,
305  typename OutStringIterator,
306  typename OutLengthIterator>
307 __global__
308 void generic_to_strided_kernel(
309  const uint32 N_strings,
310  const uint32 out_stride,
311  InStringSet in_set,
312  OutStringIterator out_stream,
313  OutLengthIterator out_lengths)
314 {
315  const uint32 tid = threadIdx.x + blockIdx.x * BLOCKDIM;
316  if (tid >= N_strings)
317  return;
318 
319  typename InStringSet::string_type in_string = in_set[tid];
320 
321  const uint32 length = in_string.size();
322 
323  typedef strided_iterator<OutStringIterator> strided_stream_type;
324 
325  strided_stream_type out_string( out_stream + tid, out_stride );
326 
327  for (uint32 j = 0; j < length; ++j)
328  out_string[j] = in_string[j];
329 
330  if (tid < N_strings)
331  out_lengths[tid] = length;
332 }
333 
334 //
335 // A kernel to transform a packed-sparse string set into a strided set
336 //
337 template <
339  uint32 BITS,
340  bool BIG_ENDIAN,
341  typename InStreamIterator,
342  typename InOffsetIterator,
343  typename OutStringIterator,
344  typename OutLengthIterator>
345 __global__
346 void packed_concat_to_strided_kernel(
347  const uint32 N_strings,
348  const uint32 out_stride,
349  InStreamIterator in_stream,
350  InOffsetIterator in_offsets,
351  OutStringIterator out_stream,
352  OutLengthIterator out_lengths)
353 {
354  const uint32 tid = threadIdx.x + blockIdx.x * BLOCKDIM;
355  if (tid >= N_strings)
356  return;
357 
358  const uint32 in_offset = in_offsets[tid];
359  const uint32 N = in_offsets[tid+1] - in_offset;
360 
361 #if 0
362  typedef typename std::iterator_traits<InStreamIterator>::value_type word_type;
363 
364  const uint32 SYMBOLS_PER_WORD = (sizeof(word_type)*8) / BITS;
365  uint32 begin_word = in_offset / SYMBOLS_PER_WORD;
366  uint32 end_word = (in_offset + N + SYMBOLS_PER_WORD-1) / SYMBOLS_PER_WORD;
367  uint32 word_offset = in_offset & (SYMBOLS_PER_WORD-1);
368 
369  // load the words of the input stream in local memory with a tight loop
370  uint32 lmem[64];
371  for (uint32 word = begin_word; word < end_word; ++word)
372  lmem[word - begin_word] = in_stream[ word ];
373 
374  typedef PackedStream<const_cached_iterator<const uint32*>,uint8,BITS,BIG_ENDIAN> const_stream_type;
375  const_stream_type clmem_stream( &lmem[0] );
376 
377  // write out the output symbols
378  for (uint32 i = 0; i < N; ++i)
379  out_stream[ tid + out_stride*i ] = clmem_stream[i + word_offset];
380 #else
381  // Naive
382  typedef PackedStream<const_cached_iterator<InStreamIterator>,uint8,BITS,BIG_ENDIAN> const_stream_type;
383  const_stream_type cstream( in_stream );
384 
385  for (uint32 i = 0; i < N; ++i)
386  out_stream[ tid + out_stride*i ] = cstream[i + in_offset];
387 #endif
388  out_lengths[tid] = N;
389 }
390 
391 //
392 // A kernel to transform a packed-sparse string set into a strided set
393 //
394 template <
396  uint32 BITS,
397  bool BIG_ENDIAN,
398  typename InStreamIterator,
399  typename InRangeIterator,
400  typename OutStringIterator,
401  typename OutLengthIterator>
402 __global__
403 void packed_sparse_to_strided_kernel(
404  const uint32 N_strings,
405  const uint32 out_stride,
406  InStreamIterator in_stream,
407  InRangeIterator in_ranges,
408  OutStringIterator out_stream,
409  OutLengthIterator out_lengths)
410 {
411  const uint32 tid = threadIdx.x + blockIdx.x * BLOCKDIM;
412  if (tid >= N_strings)
413  return;
414 
415  const uint2 range = in_ranges[tid];
416  const uint32 in_offset = range.x;
417  const uint32 N = range.y - in_offset;
418 
419 #if 1
420  typedef typename std::iterator_traits<InStreamIterator>::value_type word_type;
421 
422  const uint32 SYMBOLS_PER_WORD = (sizeof(word_type)*8) / BITS;
423  uint32 word_offset = in_offset & (SYMBOLS_PER_WORD-1);
424  uint32 begin_word = in_offset / SYMBOLS_PER_WORD;
425  uint32 end_word = (in_offset + N + SYMBOLS_PER_WORD-1) / SYMBOLS_PER_WORD;
426 
427  word_type lmem[64];
428  for (uint32 word = begin_word; word < end_word; ++word)
429  lmem[word - begin_word] = in_stream[ word ];
430 
431  typedef PackedStream<const_cached_iterator<const word_type*>,uint8,BITS,BIG_ENDIAN> const_stream_type;
432  const_stream_type clmem_stream( &lmem[0] );
433 
434  // write out the output symbols
435  for (uint32 i = 0; i < N; ++i)
436  out_stream[ tid + out_stride*i ] = clmem_stream[i + word_offset];
437 #else
438  // Naive
439  typedef PackedStream<const_cached_iterator<InStreamIterator>,uint8,BITS,BIG_ENDIAN> const_stream_type;
440  const_stream_type cstream( in_stream );
441 
442  for (uint32 i = 0; i < N; ++i)
443  out_stream[ tid + out_stride*i ] = cstream[i + in_offset];
444 #endif
445  out_lengths[tid] = N;
446 }
447 
448 //
449 // A kernel to transform a contiguous string set into a strided set
450 //
451 template <
453  typename InStringSet,
454  typename OutLengthIterator>
455 __global__
456 void contig_to_strided_uint8_kernel(
457  const uint32 N_strings,
458  const uint32 out_stride,
459  InStringSet in_set,
460  uint8* out_stream,
461  OutLengthIterator out_lengths)
462 {
463  const uint32 tid = threadIdx.x + blockIdx.x * BLOCKDIM;
464 
465  const uint32 length = tid < N_strings ? in_set[ tid ].size() : 0u;
466 
467  const uint32 WARP_SIZE = cuda::Arch::WARP_SIZE;
468  const uint32 wid = warp_id();
469  const uint32 wtid = warp_tid();
470 
471  __shared__ volatile uint8 sm[ BLOCKDIM * WARP_SIZE ];
472  volatile uint8* warp_sm = sm + wid * WARP_SIZE * WARP_SIZE;
473 
474 
475  // each warp fetches WARP_SIZE characters from WARP_SIZE strings
476  for (uint32 block = 0; __any( block < length ); block += WARP_SIZE)
477  {
478  for (uint32 t = 0; t < WARP_SIZE; ++t)
479  {
480  // compute the t-th string id
481  const uint32 t_string_id = blockIdx.x*BLOCKDIM + wid*WARP_SIZE + t;
482  if (t_string_id >= N_strings)
483  break;
484 
485  // fetch the t-th string
486  typename InStringSet::string_type t_string = in_set[ t_string_id ];
487 
488  // read 1 symbol per thread
489  warp_sm[ wtid*WARP_SIZE + t ] = (block + wtid < t_string.size()) ? t_string[block + wtid] : 0u;
490  }
491 
492  // at this point we have WARP_SIZE characters from WARP_SIZE adjacent strings in warp_sm: let's write them out
493  if (block + WARP_SIZE <= length)
494  {
495  for (uint32 s = 0; s < WARP_SIZE; ++s)
496  out_stream[ tid + (block + s)*out_stride ] = warp_sm[ s*WARP_SIZE + wtid ];
497  }
498  else if (block < length)
499  {
500  for (uint32 s = 0; s < WARP_SIZE; ++s)
501  {
502  if (block + s < length)
503  out_stream[ tid + (block + s)*out_stride ] = warp_sm[ s*WARP_SIZE + wtid ];
504  }
505  }
506  }
507 
508  // write out the length
509  if (tid < N_strings)
510  out_lengths[tid] = length;
511 }
512 
513 //
514 // A kernel to transform a generic string set into a strided set
515 //
516 template <
518  typename InStringSet,
519  typename OutLengthIterator>
520 __global__
521 void generic_to_strided_uint8_kernel(
522  const uint32 N_strings,
523  const uint32 out_stride,
524  InStringSet in_set,
525  uint8* out_stream,
526  OutLengthIterator out_lengths)
527 {
528  const uint32 tid = threadIdx.x + blockIdx.x * BLOCKDIM;
529  const uint32 base_id = tid*4;
530  if (base_id >= N_strings)
531  return;
532 
533  typedef strided_iterator<uint32*> strided_stream_type;
534 
535  strided_stream_type out_string( (uint32*)(out_stream) + tid, out_stride/4u );
536 
537  if (base_id + 3 < N_strings)
538  {
539  // fetch 4 strings
540  typename InStringSet::string_type in_string0 = in_set[base_id + 0];
541  typename InStringSet::string_type in_string1 = in_set[base_id + 1];
542  typename InStringSet::string_type in_string2 = in_set[base_id + 2];
543  typename InStringSet::string_type in_string3 = in_set[base_id + 3];
544 
545  const uint32 length0 = in_string0.size();
546  const uint32 length1 = in_string1.size();
547  const uint32 length2 = in_string2.size();
548  const uint32 length3 = in_string3.size();
549 
550  const uint32 min_length = nvbio::min(
551  nvbio::min( length0, length1 ),
552  nvbio::min( length2, length3 ) );
553 
554  for (uint32 j = 0; j < min_length; ++j)
555  {
556  const uint32 c0 = in_string0[j];
557  const uint32 c1 = in_string1[j];
558  const uint32 c2 = in_string2[j];
559  const uint32 c3 = in_string3[j];
560 
561  out_string[j] = c0 | (c1 << 8) | (c2 << 16) | (c3 << 24);
562  }
563 
564  const uint32 max_length = nvbio::max(
565  nvbio::max( length0, length1 ),
566  nvbio::max( length2, length3 ) );
567 
568  for (uint32 j = min_length; j < max_length; ++j)
569  {
570  const uint32 c0 = j < length0 ? in_string0[j] : 0xFFu;
571  const uint32 c1 = j < length1 ? in_string1[j] : 0xFFu;
572  const uint32 c2 = j < length2 ? in_string2[j] : 0xFFu;
573  const uint32 c3 = j < length3 ? in_string3[j] : 0xFFu;
574 
575  out_string[j] = c0 | (c1 << 8) | (c2 << 16) | (c3 << 24);
576  }
577 
578  out_lengths[base_id] = length0;
579  out_lengths[base_id + 1] = length1;
580  out_lengths[base_id + 2] = length2;
581  out_lengths[base_id + 3] = length3;
582  }
583  else if (base_id + 2 < N_strings)
584  {
585  // fetch 3 strings
586  typename InStringSet::string_type in_string0 = in_set[base_id + 0];
587  typename InStringSet::string_type in_string1 = in_set[base_id + 1];
588  typename InStringSet::string_type in_string2 = in_set[base_id + 2];
589 
590  const uint32 length0 = in_string0.size();
591  const uint32 length1 = in_string1.size();
592  const uint32 length2 = in_string2.size();
593 
594  const uint32 min_length = nvbio::min3( length0, length1, length2 );
595  for (uint32 j = 0; j < min_length; ++j)
596  {
597  const uint32 c0 = in_string0[j];
598  const uint32 c1 = in_string1[j];
599  const uint32 c2 = in_string2[j];
600  const uint32 c3 = 0xFFu;
601 
602  out_string[j] = c0 | (c1 << 8) | (c2 << 16) | (c3 << 24);
603  }
604 
605  const uint32 max_length = nvbio::max3( length0, length1, length2 );
606  for (uint32 j = min_length; j < max_length; ++j)
607  {
608  const uint32 c0 = j < length0 ? in_string0[j] : 0xFFu;
609  const uint32 c1 = j < length1 ? in_string1[j] : 0xFFu;
610  const uint32 c2 = j < length2 ? in_string2[j] : 0xFFu;
611  const uint32 c3 = 0xFFu;
612 
613  out_string[j] = c0 | (c1 << 8) | (c2 << 16) | (c3 << 24);
614  }
615 
616  out_lengths[base_id] = length0;
617  out_lengths[base_id + 1] = length1;
618  out_lengths[base_id + 2] = length2;
619  }
620  else if (base_id + 1 < N_strings)
621  {
622  // fetch 2 strings
623  typename InStringSet::string_type in_string0 = in_set[base_id + 0];
624  typename InStringSet::string_type in_string1 = in_set[base_id + 1];
625 
626  const uint32 length0 = in_string0.size();
627  const uint32 length1 = in_string1.size();
628 
629  const uint32 min_length = nvbio::min( length0, length1 );
630  for (uint32 j = 0; j < min_length; ++j)
631  {
632  const uint32 c0 = in_string0[j];
633  const uint32 c1 = in_string1[j];
634  const uint32 c2 = 0xFFu;
635  const uint32 c3 = 0xFFu;
636 
637  out_string[j] = c0 | (c1 << 8) | (c2 << 16) | (c3 << 24);
638  }
639 
640  const uint32 max_length = nvbio::max( length0, length1 );
641  for (uint32 j = min_length; j < max_length; ++j)
642  {
643  const uint32 c0 = j < length0 ? in_string0[j] : 0xFFu;
644  const uint32 c1 = j < length1 ? in_string1[j] : 0xFFu;
645  const uint32 c2 = 0xFFu;
646  const uint32 c3 = 0xFFu;
647 
648  out_string[j] = c0 | (c1 << 8) | (c2 << 16) | (c3 << 24);
649  }
650 
651  out_lengths[base_id] = length0;
652  out_lengths[base_id + 1] = length1;
653  }
654  else
655  {
656  // fetch 1 string
657  typename InStringSet::string_type in_string = in_set[base_id + 0];
658 
659  const uint32 length = in_string.size();
660 
661  for (uint32 j = 0; j < length; ++j)
662  {
663  const uint32 c0 = in_string[j];
664  const uint32 c1 = 0xFFu;
665  const uint32 c2 = 0xFFu;
666  const uint32 c3 = 0xFFu;
667 
668  out_string[j] = c0 | (c1 << 8) | (c2 << 16) | (c3 << 24);
669  }
670 
671  out_lengths[base_id] = length;
672  }
673 }
674 
675 //
676 // A kernel to transform a strided-packed string set into a strided set
677 //
678 template <
680  typename InStringSet,
681  typename OutLengthIterator>
682 __global__
683 void strided_packed_to_strided_uint8_kernel(
684  const uint32 N_strings,
685  const uint32 out_stride,
686  InStringSet in_set,
687  uint8* out_stream,
688  OutLengthIterator out_lengths)
689 {
690  const uint32 tid = blockIdx.x * BLOCKDIM + threadIdx.x;
691  const uint32 id = tid * 4;
692  if (id >= N_strings)
693  return;
694 
695  __shared__ uint32 sm[ BLOCKDIM*4 ];
696 
697  typedef PackedStream<
698  uint32*,
699  typename InStringSet::symbol_type,
700  InStringSet::SYMBOL_SIZE,
701  InStringSet::BIG_ENDIAN> stream_type;
702 
703  const uint32 SYMBOLS_PER_WORD = (sizeof(uint32)*8u) / InStringSet::SYMBOL_SIZE;
704 
705  const uint32 length0 = in_set.lengths()[id + 0];
706  const uint32 length1 = in_set.lengths()[id + 1];
707  const uint32 length2 = in_set.lengths()[id + 2];
708  const uint32 length3 = in_set.lengths()[id + 3];
709 
710  const uint32 N = nvbio::max(
711  nvbio::max( length0, length1 ),
712  nvbio::max( length2, length3 ) );
713 
714  const uint32 N_words = (N + SYMBOLS_PER_WORD-1) / SYMBOLS_PER_WORD;
715 
716  const uint32* in_stream = in_set.base_stream();
717  const uint32 in_stride = in_set.stride();
718 
719  for (uint32 i = 0; i < N_words; ++i)
720  {
721  // fetch 4 words, 1 from each stream
722  const uint4 in_words = *reinterpret_cast<const uint4*>( in_stream + i * in_stride + id );
723 
724  // setup 4 streams on the words just read
725  sm[threadIdx.x + BLOCKDIM*0] = in_words.x;
726  sm[threadIdx.x + BLOCKDIM*1] = in_words.y;
727  sm[threadIdx.x + BLOCKDIM*2] = in_words.z;
728  sm[threadIdx.x + BLOCKDIM*3] = in_words.w;
729 
730  stream_type streams[4] =
731  {
732  stream_type( &sm[threadIdx.x + BLOCKDIM*0] ),
733  stream_type( &sm[threadIdx.x + BLOCKDIM*1] ),
734  stream_type( &sm[threadIdx.x + BLOCKDIM*2] ),
735  stream_type( &sm[threadIdx.x + BLOCKDIM*3] )
736  };
737 
738  // write out all the symbols packed in the fetched words interleaving them as uint8's in a simd4u8.
739  for (uint32 j = 0; j < SYMBOLS_PER_WORD; ++j)
740  {
741  // read the next symbol from each of the 4 streams
742  const uint32 word =
743  (streams[0][j] << 0) |
744  (streams[1][j] << 8) |
745  (streams[2][j] << 16) |
746  (streams[3][j] << 24);
747 
748  // and write the newly packed word out
749  *reinterpret_cast<uint32*>( out_stream + (i*SYMBOLS_PER_WORD + j)*out_stride + id ) = word;
750  }
751  }
752 
753  out_lengths[id] = length0;
754  out_lengths[id + 1] = length1;
755  out_lengths[id + 2] = length2;
756  out_lengths[id + 3] = length3;
757 }
758 
759 //
760 // A kernel to transform a generic string set into a strided packed set
761 //
762 template <
763  uint32 BLOCKDIM,
764  uint32 SYMBOL_SIZE,
765  bool BIG_ENDIAN,
766  typename InStringSet,
767  typename OutStreamIterator,
768  typename OutLengthIterator>
769 __global__
770 void generic_to_strided_packed_kernel(
771  const uint32 N_strings,
772  const uint32 out_stride,
773  InStringSet in_set,
774  OutStreamIterator out_stream,
775  OutLengthIterator out_lengths)
776 {
777  const uint32 tid = threadIdx.x + blockIdx.x * BLOCKDIM;
778  if (tid >= N_strings)
779  return;
780 
781  typename InStringSet::string_type in_string = in_set[tid];
782 
783  const uint32 length = in_string.size();
784 
785  typedef strided_iterator<OutStreamIterator> strided_stream_type;
786  typedef PackedStream<strided_stream_type,uint8,SYMBOL_SIZE,BIG_ENDIAN> packed_stream_type;
787 
788  packed_stream_type out_string( strided_stream_type( out_stream + tid, out_stride ) );
789 
790  for (uint32 j = 0; j < length; ++j)
791  out_string[j] = in_string[j];
792 
793  if (tid < N_strings)
794  out_lengths[tid] = length;
795 }
796 
797 //
798 // A kernel to transform a contiguous string set into a strided packed set
799 //
800 template <
801  uint32 BLOCKDIM,
802  uint32 SYMBOL_SIZE,
803  bool BIG_ENDIAN,
804  typename InStringSet,
805  typename OutStreamIterator,
806  typename OutLengthIterator>
807 __global__
808 void contig_to_strided_packed_kernel(
809  const uint32 N_strings,
810  const uint32 out_stride,
811  InStringSet in_set,
812  OutStreamIterator out_stream,
813  OutLengthIterator out_lengths)
814 {
815  const uint32 tid = threadIdx.x + blockIdx.x * BLOCKDIM;
816 
817  const uint32 length = tid < N_strings ? in_set[ tid ].size() : 0u;
818 
819  const uint32 WARP_SIZE = cuda::Arch::WARP_SIZE;
820  const uint32 wid = warp_id();
821  const uint32 wtid = warp_tid();
822 
823  __shared__ volatile uint8 sm[ BLOCKDIM * WARP_SIZE ];
824  volatile uint8* warp_sm = sm + wid * WARP_SIZE * WARP_SIZE;
825 
826  typedef strided_iterator<OutStreamIterator> strided_stream_type;
827  typedef PackedStream<strided_stream_type,uint8,SYMBOL_SIZE,BIG_ENDIAN> packed_stream_type;
828 
829  packed_stream_type out_string( strided_stream_type( out_stream + tid, out_stride ) );
830 
831 
832  // each warp fetches WARP_SIZE characters from WARP_SIZE strings
833  for (uint32 block = 0; __any( block < length ); block += WARP_SIZE)
834  {
835  for (uint32 t = 0; t < WARP_SIZE; ++t)
836  {
837  // compute the t-th string id
838  const uint32 t_string_id = blockIdx.x*BLOCKDIM + wid*WARP_SIZE + t;
839  if (t_string_id >= N_strings)
840  break;
841 
842  // fetch the t-th string
843  typename InStringSet::string_type t_string = in_set[ t_string_id ];
844 
845  // read 1 symbol per thread
846  warp_sm[ t*WARP_SIZE + wtid ] = (block + wtid < t_string.size()) ? t_string[block + wtid] : 0u;
847  }
848 
849  // at this point we have WARP_SIZE characters from WARP_SIZE adjacent strings in warp_sm: let's write them out
850  if (block < length)
851  {
852  //for (uint32 s = 0; s < WARP_SIZE; ++s)
853  //{
854  // if (block + s < length)
855  // out_string[ block + s ] = warp_sm[ wtid*WARP_SIZE + s ];
856  //}
857 
858  // pack the symbols in a word in a register before writing it out
859  uint32 word;
860  PackedStream<uint32*,uint8,SYMBOL_SIZE,BIG_ENDIAN> packed_word( &word );
861 
862  const uint32 SYMBOLS_PER_WORD = (8u*sizeof(uint32))/SYMBOL_SIZE;
863  for (uint32 s = 0; s < WARP_SIZE; s += SYMBOLS_PER_WORD)
864  {
865  if (block + s < length)
866  {
867  for (uint32 b = 0; b < SYMBOLS_PER_WORD; ++b)
868  packed_word[b] = warp_sm[ wtid*WARP_SIZE + s + b ];
869 
870  const uint32 word_idx = (block + s) / SYMBOLS_PER_WORD;
871  out_stream[ tid + word_idx*out_stride ] = word;
872  }
873  }
874  }
875  }
876 
877  // write out the length
878  if (tid < N_strings)
879  out_lengths[tid] = length;
880 }
881 
882 //
883 // A kernel to transform a packed concatenated string set into a strided one
884 //
885 template <
886  uint32 BLOCKDIM,
887  uint32 SYMBOL_SIZE,
888  bool BIG_ENDIAN,
889  typename InStreamIterator,
890  typename InOffsetIterator,
891  typename OutStreamIterator,
892  typename OutLengthIterator>
893 __global__
894 void packed_concatenated_to_strided_packed_kernel(
895  const uint32 N_strings,
896  const uint32 out_stride,
897  InStreamIterator in_stream,
898  InOffsetIterator in_offsets,
899  OutStreamIterator out_stream,
900  OutLengthIterator out_lengths)
901 {
902  const uint32 tid = threadIdx.x + blockIdx.x * BLOCKDIM;
903  if (tid >= N_strings)
904  return;
905 
906  const uint32 offset = in_offsets[tid];
907  const uint32 length = in_offsets[tid+1] - offset;
908 
909  transpose_packed_streams<BLOCKDIM,SYMBOL_SIZE,BIG_ENDIAN>(
910  out_stride,
911  length,
912  offset,
913  in_stream,
914  out_stream + tid );
915 
916  out_lengths[tid] = length;
917 }
918 
919 //
920 // A kernel to transform a packed sparse string set into a strided one
921 //
922 template <
923  uint32 BLOCKDIM,
924  uint32 SYMBOL_SIZE,
925  bool BIG_ENDIAN,
926  typename InStreamIterator,
927  typename InRangeIterator,
928  typename OutStreamIterator,
929  typename OutLengthIterator>
930 __global__
931 void packed_sparse_to_strided_packed_kernel(
932  const uint32 N_strings,
933  const uint32 out_stride,
934  InStreamIterator in_stream,
935  InRangeIterator in_ranges,
936  OutStreamIterator out_stream,
937  OutLengthIterator out_lengths)
938 {
939  const uint32 tid = threadIdx.x + blockIdx.x * BLOCKDIM;
940  if (tid >= N_strings)
941  return;
942 
943  const uint2 range = in_ranges[tid];
944  const uint32 offset = range.x;
945  const uint32 length = range.y - range.x;
946 
947  transpose_packed_streams<BLOCKDIM,SYMBOL_SIZE,BIG_ENDIAN>(
948  out_stride,
949  length,
950  offset,
951  in_stream,
952  out_stream + tid );
953 
954  out_lengths[tid] = length;
955 }
956 
957 template <typename OutStringSet>
958 struct copy_dispatch
959 {
960  template <typename InStringSet>
961  struct source_dispatch
962  {
963  static void enact(
964  const InStringSet& in_string_set,
965  OutStringSet& out_string_set)
966  {
967  }
968  };
969 };
970 
971 //
972 // concatenated output set
973 //
974 template <
975  typename OutStringIterator,
976  typename OutOffsetIterator>
977 struct copy_dispatch<
978  ConcatenatedStringSet<OutStringIterator,OutOffsetIterator>
979  >
980 {
981  typedef ConcatenatedStringSet<OutStringIterator,OutOffsetIterator> out_string_set_type;
982 
983  // generic input set
984  template <typename in_string_set_type>
985  struct source_dispatch
986  {
987  static void enact(
988  const in_string_set_type& in_string_set,
989  out_string_set_type& out_string_set)
990  {
991  const uint32 BLOCKDIM = 64u;
992 
993  if (out_string_set.size() != in_string_set.size())
994  throw nvbio::runtime_error( "copy() : unmatched string set sizes" );
995 
996  const uint32 n_blocks = (in_string_set.size()+1 + BLOCKDIM-1)/BLOCKDIM;
997 
998  // extract the string lengths
999  generic_string_lengths_kernel<BLOCKDIM> <<<n_blocks,BLOCKDIM>>>(
1000  in_string_set.size(),
1001  in_string_set,
1002  out_string_set.offsets() );
1003 
1004  cudaThreadSynchronize();
1005 
1006  #if THRUST_VERSION <= 100503
1007  // perform an exclusive scan on the string lengths to get the offsets
1009  out_string_set.offsets(),
1010  out_string_set.offsets() + in_string_set.size()+1,
1011  out_string_set.offsets(),
1012  0u,
1013  add_functor() );
1014  #else
1015  // perform an exclusive scan on the string lengths to get the offsets
1017  thrust::device,
1018  out_string_set.offsets(),
1019  out_string_set.offsets() + in_string_set.size()+1,
1020  out_string_set.offsets(),
1021  0u,
1022  add_functor() );
1023  #endif
1024 
1025  generic_to_concat_kernel<BLOCKDIM> <<<n_blocks,BLOCKDIM>>>(
1026  in_string_set.size(),
1027  in_string_set,
1028  out_string_set.base_string(),
1029  out_string_set.offsets() );
1030 
1031  cudaThreadSynchronize();
1032  }
1033  };
1034 
1035  // sparse input set
1036  template <
1037  typename InStringIterator,
1038  typename InRangeIterator>
1039  struct source_dispatch< SparseStringSet<InStringIterator,InRangeIterator> >
1040  {
1041  typedef SparseStringSet<InStringIterator,InRangeIterator> in_string_set_type;
1042 
1043  static void enact(
1044  const in_string_set_type& in_string_set,
1045  out_string_set_type& out_string_set)
1046  {
1047  const uint32 BLOCKDIM = 128u;
1048 
1049  if (out_string_set.size() != in_string_set.size())
1050  throw nvbio::runtime_error( "copy() : unmatched string set sizes" );
1051 
1052  const uint32 n_blocks = (in_string_set.size()+1 + BLOCKDIM-1)/BLOCKDIM;
1053 
1054  // extract the string lengths
1055  generic_string_lengths_kernel<BLOCKDIM> <<<n_blocks,BLOCKDIM>>>(
1056  in_string_set.size(),
1057  in_string_set,
1058  out_string_set.offsets() );
1059 
1060  cudaThreadSynchronize();
1061 
1062  #if THRUST_VERSION <= 100503
1063  // perform an exclusive scan on the string lengths to get the offsets
1065  out_string_set.offsets(),
1066  out_string_set.offsets() + in_string_set.size()+1,
1067  out_string_set.offsets(),
1068  0u,
1069  add_functor() );
1070  #else
1071  // perform an exclusive scan on the string lengths to get the offsets
1073  thrust::device,
1074  out_string_set.offsets(),
1075  out_string_set.offsets() + in_string_set.size()+1,
1076  out_string_set.offsets(),
1077  0u,
1078  add_functor() );
1079  #endif
1080  {
1081  // use 1 warp per string
1082  const uint32 WARPS_PER_BLOCK = BLOCKDIM >> cuda::Arch::LOG_WARP_SIZE;
1083  const uint32 n_blocks = (in_string_set.size()+1 + WARPS_PER_BLOCK-1)/WARPS_PER_BLOCK;
1084 
1085  contig_to_concat_kernel<BLOCKDIM> <<<n_blocks,BLOCKDIM>>>(
1086  in_string_set.size(),
1087  in_string_set,
1088  out_string_set.base_string(),
1089  out_string_set.offsets() );
1090 
1091  cudaThreadSynchronize();
1092  }
1093  }
1094  };
1095 
1096  template <typename in_string_set_type>
1097  static void enact(
1098  const in_string_set_type& in_string_set,
1099  out_string_set_type& out_string_set)
1100  {
1101  return source_dispatch<in_string_set_type>::enact( in_string_set, out_string_set );
1102  }
1103 };
1104 
1105 //
1106 // packed-concatenated output set
1107 //
1108 template <
1109  typename SymbolType,
1110  uint32 SYMBOL_SIZE_T,
1111  bool BIG_ENDIAN_T,
1112  typename OutStreamIterator,
1113  typename OutOffsetIterator>
1114 struct copy_dispatch<
1115  ConcatenatedStringSet<
1116  PackedStream<OutStreamIterator,SymbolType,SYMBOL_SIZE_T,BIG_ENDIAN_T>,
1117  OutOffsetIterator >
1118  >
1119 {
1120  typedef ConcatenatedStringSet<
1121  PackedStream<OutStreamIterator,SymbolType,SYMBOL_SIZE_T,BIG_ENDIAN_T>,
1122  OutOffsetIterator >
1123  out_string_set_type;
1124 
1125  // generic input set
1126  template <typename in_string_set_type>
1127  struct source_dispatch
1128  {
1129  static void enact(
1130  const in_string_set_type& in_string_set,
1131  out_string_set_type& out_string_set)
1132  {
1133  const uint32 BLOCKDIM = 64u;
1134 
1135  if (out_string_set.size() != in_string_set.size())
1136  throw nvbio::runtime_error( "copy() : unmatched string set sizes" );
1137 
1138  {
1139  const uint32 n_blocks = (in_string_set.size()+1 + BLOCKDIM-1)/BLOCKDIM;
1140 
1141  // extract the string lengths
1142  generic_string_lengths_kernel<BLOCKDIM> <<<n_blocks,BLOCKDIM>>>(
1143  in_string_set.size(),
1144  in_string_set,
1145  out_string_set.offsets() );
1146 
1147  cudaThreadSynchronize();
1148  }
1149  #if THRUST_VERSION <= 100503
1150  // perform an exclusive scan on the string lengths to get the offsets
1152  out_string_set.offsets(),
1153  out_string_set.offsets() + in_string_set.size()+1,
1154  out_string_set.offsets(),
1155  0u,
1156  add_functor() );
1157  #else
1158  // perform an exclusive scan on the string lengths to get the offsets
1160  thrust::device,
1161  out_string_set.offsets(),
1162  out_string_set.offsets() + in_string_set.size()+1,
1163  out_string_set.offsets(),
1164  0u,
1165  add_functor() );
1166  #endif
1167  // extract the total string set length from the offset vector
1168  thrust::device_vector<uint32> d_total_length(1);
1169 
1170  generic_vector_copy_kernel<BLOCKDIM> <<<1,1>>> (
1171  1u,
1172  out_string_set.offsets() + in_string_set.size(),
1173  thrust::raw_pointer_cast( &d_total_length.front() ) );
1174 
1175  cudaThreadSynchronize();
1176 
1177  const uint32 SYMBOLS_PER_WORD = (8u*sizeof(uint32)) / SYMBOL_SIZE_T;
1178  const uint32 N_symbols = d_total_length[0];
1179  const uint32 N_words = (N_symbols + SYMBOLS_PER_WORD-1) / SYMBOLS_PER_WORD;
1180  {
1181  const uint32 n_blocks = (N_words + BLOCKDIM-1)/BLOCKDIM;
1182 
1183  generic_to_packed_concat_kernel<BLOCKDIM,SYMBOL_SIZE_T,BIG_ENDIAN_T> <<<n_blocks,BLOCKDIM>>>(
1184  in_string_set.size(),
1185  N_words,
1186  in_string_set,
1187  out_string_set.base_string().stream(),
1188  out_string_set.offsets() );
1189 
1190  cudaThreadSynchronize();
1191  }
1192  }
1193  };
1194 
1195  // concatenated input set
1196  template <
1197  typename InStringIterator,
1198  typename InOffsetIterator>
1199  struct source_dispatch<
1200  ConcatenatedStringSet<
1201  InStringIterator,
1202  InOffsetIterator>
1203  >
1204  {
1205  typedef ConcatenatedStringSet<
1206  InStringIterator,
1207  InOffsetIterator>
1208  in_string_set_type;
1209 
1210  static void enact(
1211  const in_string_set_type& in_string_set,
1212  out_string_set_type& out_string_set)
1213  {
1214  const uint32 BLOCKDIM = 64u;
1215 
1216  if (out_string_set.size() != in_string_set.size())
1217  throw nvbio::runtime_error( "copy() : unmatched string set sizes" );
1218 
1219  // extract the total string set length from the offset vector
1220  thrust::device_vector<uint32> d_total_length(1);
1221 
1222  generic_vector_copy_kernel<BLOCKDIM> <<<1,1>>> (
1223  1u,
1224  in_string_set.offsets() + in_string_set.size(),
1225  thrust::raw_pointer_cast( &d_total_length.front() ) );
1226 
1227  cudaThreadSynchronize();
1228 
1229  const uint32 SYMBOLS_PER_WORD = (8u*sizeof(uint32)) / SYMBOL_SIZE_T;
1230  const uint32 N_symbols = d_total_length[0];
1231  const uint32 N_words = (N_symbols + SYMBOLS_PER_WORD-1) / SYMBOLS_PER_WORD;
1232  {
1233  const uint32 n_blocks = (N_words + BLOCKDIM-1)/BLOCKDIM;
1234 
1235  concat_to_packed_concat_kernel<BLOCKDIM,SYMBOL_SIZE_T,BIG_ENDIAN_T> <<<n_blocks,BLOCKDIM>>>(
1236  in_string_set.size(),
1237  N_symbols,
1238  in_string_set,
1239  out_string_set.base_string().stream(),
1240  out_string_set.offsets() );
1241 
1242  cudaThreadSynchronize();
1243  }
1244  }
1245  };
1246 
1247  template <typename in_string_set_type>
1248  static void enact(
1249  const in_string_set_type& in_string_set,
1250  out_string_set_type& out_string_set)
1251  {
1252  return source_dispatch<in_string_set_type>::enact( in_string_set, out_string_set );
1253  }
1254 };
1255 
1256 //
1257 // strided output set
1258 //
1259 template <
1260  typename OutStreamIterator,
1261  typename OutLengthIterator>
1262 struct copy_dispatch<
1263  StridedStringSet<
1264  OutStreamIterator,
1265  OutLengthIterator>
1266  >
1267 {
1268  typedef StridedStringSet<OutStreamIterator, OutLengthIterator> out_string_set_type;
1269 
1270  // generic input set
1271  template <typename in_string_set_type>
1272  struct source_dispatch
1273  {
1274  static void enact(
1275  const in_string_set_type& in_string_set,
1276  out_string_set_type& out_string_set)
1277  {
1278  const uint32 BLOCKDIM = 64u;
1279 
1280  if (out_string_set.size() != in_string_set.size() ||
1281  out_string_set.stride() < out_string_set.size())
1282  throw nvbio::runtime_error( "copy() : unmatched string set sizes" );
1283 
1284  const uint32 n_blocks = (in_string_set.size() + BLOCKDIM-1)/BLOCKDIM;
1285 
1286  generic_to_strided_kernel<BLOCKDIM> <<<n_blocks,BLOCKDIM>>>(
1287  in_string_set.size(),
1288  out_string_set.stride(),
1289  in_string_set,
1290  out_string_set.base_string(),
1291  out_string_set.lengths() );
1292 
1293  cudaThreadSynchronize();
1294  }
1295  };
1296 
1297  // packed-concat input set
1298  template <
1299  typename InStreamIterator,
1300  typename SymbolType,
1301  uint32 SYMBOL_SIZE_T,
1302  bool BIG_ENDIAN_T,
1303  typename InOffsetIterator>
1304  struct source_dispatch<
1305  ConcatenatedStringSet<
1306  PackedStream<InStreamIterator,SymbolType,SYMBOL_SIZE_T,BIG_ENDIAN_T>,
1307  InOffsetIterator>
1308  >
1309  {
1310  typedef ConcatenatedStringSet<
1311  PackedStream<InStreamIterator,SymbolType,SYMBOL_SIZE_T,BIG_ENDIAN_T>,
1312  InOffsetIterator> in_string_set_type;
1313 
1314  static void enact(
1315  const in_string_set_type& in_string_set,
1316  const out_string_set_type& out_string_set)
1317  {
1318  const uint32 BLOCKDIM = 64u;
1319 
1320  if (out_string_set.size() != in_string_set.size() ||
1321  out_string_set.stride() < out_string_set.size())
1322  throw nvbio::runtime_error( "copy() : unmatched string set sizes" );
1323 
1324  const uint32 n_blocks = (in_string_set.size() + BLOCKDIM-1)/BLOCKDIM;
1325 
1326  // get the base word stream of the input
1327  const InStreamIterator in_stream = in_string_set.base_string().stream();
1328 
1329  packed_concat_to_strided_kernel<
1330  BLOCKDIM,
1331  SYMBOL_SIZE_T,
1332  BIG_ENDIAN_T>
1333  <<<n_blocks,BLOCKDIM>>>(
1334  in_string_set.size(),
1335  out_string_set.stride(),
1336  in_stream,
1337  in_string_set.offsets(),
1338  out_string_set.base_string(),
1339  out_string_set.lengths() );
1340 
1341  cudaThreadSynchronize();
1342  }
1343  };
1344 
1345  // packed-sparse input set
1346  template <
1347  typename InStreamIterator,
1348  typename SymbolType,
1349  uint32 SYMBOL_SIZE_T,
1350  bool BIG_ENDIAN_T,
1351  typename InOffsetIterator>
1352  struct source_dispatch<
1353  SparseStringSet<
1354  PackedStream<InStreamIterator,SymbolType,SYMBOL_SIZE_T,BIG_ENDIAN_T>,
1355  InOffsetIterator>
1356  >
1357  {
1358  typedef SparseStringSet<
1359  PackedStream<InStreamIterator,SymbolType,SYMBOL_SIZE_T,BIG_ENDIAN_T>,
1360  InOffsetIterator> in_string_set_type;
1361 
1362  static void enact(
1363  const in_string_set_type& in_string_set,
1364  const out_string_set_type& out_string_set)
1365  {
1366  const uint32 BLOCKDIM = 64u;
1367 
1368  if (out_string_set.size() != in_string_set.size() ||
1369  out_string_set.stride() < out_string_set.size())
1370  throw nvbio::runtime_error( "copy() : unmatched string set sizes" );
1371 
1372  const uint32 n_blocks = (in_string_set.size() + BLOCKDIM-1)/BLOCKDIM;
1373 
1374  // get the base word stream of the input
1375  const InStreamIterator in_stream = in_string_set.base_string().stream();
1376 
1377  packed_sparse_to_strided_kernel<
1378  BLOCKDIM,
1379  SYMBOL_SIZE_T,
1380  BIG_ENDIAN_T>
1381  <<<n_blocks,BLOCKDIM>>>(
1382  in_string_set.size(),
1383  out_string_set.stride(),
1384  in_stream,
1385  in_string_set.ranges(),
1386  out_string_set.base_string(),
1387  out_string_set.lengths() );
1388 
1389  cudaThreadSynchronize();
1390  }
1391  };
1392 
1393  template <typename in_string_set_type>
1394  static void enact(
1395  const in_string_set_type& in_string_set,
1396  out_string_set_type& out_string_set)
1397  {
1398  return source_dispatch<in_string_set_type>::enact( in_string_set, out_string_set );
1399  }
1400 };
1401 
1402 //
1403 // strided output set
1404 //
1405 template <
1406  typename OutLengthIterator>
1407 struct copy_dispatch<
1408  StridedStringSet<
1409  uint8*,
1410  OutLengthIterator>
1411  >
1412 {
1413  typedef StridedStringSet<uint8*, OutLengthIterator> out_string_set_type;
1414 
1415  // generic input set
1416  template <typename in_string_set_type>
1417  struct source_dispatch
1418  {
1419  static void enact(
1420  const in_string_set_type& in_string_set,
1421  out_string_set_type& out_string_set)
1422  {
1423  const uint32 BLOCKDIM = 64u;
1424 
1425  if (out_string_set.size() != in_string_set.size() ||
1426  out_string_set.stride() < out_string_set.size())
1427  throw nvbio::runtime_error( "copy() : unmatched string set sizes" );
1428 
1429  #if 0
1430  if ((out_string_set.stride() & 3) == 0)
1431  {
1432  const uint32 n_quads = (in_string_set.size()+3) / 4u;
1433  const uint32 n_blocks = (n_quads + BLOCKDIM-1)/BLOCKDIM;
1434 
1435  generic_to_strided_uint8_kernel<BLOCKDIM> <<<n_blocks,BLOCKDIM>>>(
1436  in_string_set.size(),
1437  out_string_set.stride(),
1438  in_string_set,
1439  out_string_set.base_string(),
1440  out_string_set.lengths() );
1441  }
1442  else
1443  #endif
1444  {
1445  const uint32 n_blocks = (in_string_set.size() + BLOCKDIM-1)/BLOCKDIM;
1446 
1447  generic_to_strided_kernel<BLOCKDIM> <<<n_blocks,BLOCKDIM>>>(
1448  in_string_set.size(),
1449  out_string_set.stride(),
1450  in_string_set,
1451  out_string_set.base_string(),
1452  out_string_set.lengths() );
1453  }
1454 
1455  cudaThreadSynchronize();
1456  }
1457  };
1458 
1459  // sparse input set
1460  template <
1461  typename InStringIterator,
1462  typename InRangeIterator>
1463  struct source_dispatch<
1464  SparseStringSet<
1465  InStringIterator,
1466  InRangeIterator>
1467  >
1468  {
1469  typedef SparseStringSet<
1470  InStringIterator,
1471  InRangeIterator>
1472  in_string_set_type;
1473 
1474  static void enact(
1475  const in_string_set_type& in_string_set,
1476  out_string_set_type& out_string_set)
1477  {
1478  const uint32 BLOCKDIM = 64u;
1479 
1480  if (out_string_set.size() != in_string_set.size() ||
1481  out_string_set.stride() < out_string_set.size())
1482  throw nvbio::runtime_error( "copy() : unmatched string set sizes" );
1483 
1484  const uint32 n_blocks = (in_string_set.size() + BLOCKDIM-1)/BLOCKDIM;
1485 
1486  contig_to_strided_uint8_kernel<BLOCKDIM> <<<n_blocks,BLOCKDIM>>>(
1487  in_string_set.size(),
1488  out_string_set.stride(),
1489  in_string_set,
1490  out_string_set.base_string(),
1491  out_string_set.lengths() );
1492 
1493  cudaThreadSynchronize();
1494  }
1495  };
1496 
1497  // concatenated input set
1498  template <
1499  typename InStringIterator,
1500  typename InOffsetIterator>
1501  struct source_dispatch<
1502  ConcatenatedStringSet<
1503  InStringIterator,
1504  InOffsetIterator>
1505  >
1506  {
1507  typedef ConcatenatedStringSet<
1508  InStringIterator,
1509  InOffsetIterator>
1510  in_string_set_type;
1511 
1512  static void enact(
1513  const in_string_set_type& in_string_set,
1514  out_string_set_type& out_string_set)
1515  {
1516  const uint32 BLOCKDIM = 64u;
1517 
1518  if (out_string_set.size() != in_string_set.size() ||
1519  out_string_set.stride() < out_string_set.size())
1520  throw nvbio::runtime_error( "copy() : unmatched string set sizes" );
1521 
1522  const uint32 n_blocks = (in_string_set.size() + BLOCKDIM-1)/BLOCKDIM;
1523 
1524  contig_to_strided_uint8_kernel<BLOCKDIM> <<<n_blocks,BLOCKDIM>>>(
1525  in_string_set.size(),
1526  out_string_set.stride(),
1527  in_string_set,
1528  out_string_set.base_string(),
1529  out_string_set.lengths() );
1530 
1531  cudaThreadSynchronize();
1532  }
1533  };
1534 
1535  // strided-packed input set
1536  template <
1537  typename SymbolType,
1538  uint32 SYMBOL_SIZE_T,
1539  uint32 BIG_ENDIAN_T,
1540  typename InLengthIterator>
1541  struct source_dispatch<
1542  StridedPackedStringSet<
1543  uint32*,
1544  SymbolType,
1545  SYMBOL_SIZE_T,
1546  BIG_ENDIAN_T,
1547  InLengthIterator>
1548  >
1549  {
1550  typedef StridedPackedStringSet<
1551  uint32*,
1552  SymbolType,
1553  SYMBOL_SIZE_T,
1554  BIG_ENDIAN_T,
1555  InLengthIterator>
1556  in_string_set_type;
1557 
1558  static void enact(
1559  const in_string_set_type& in_string_set,
1560  out_string_set_type& out_string_set)
1561  {
1562  const uint32 BLOCKDIM = 64u;
1563 
1564  if (out_string_set.size() != in_string_set.size() ||
1565  out_string_set.stride() < out_string_set.size())
1566  throw nvbio::runtime_error( "copy() : unmatched string set sizes" );
1567 
1568  // optimize if the stride is a multiple of 4 (i.e. if we can write out full 32-bit words)
1569  if ((out_string_set.stride() & 3) == 0)
1570  {
1571  const uint32 n_quads = (in_string_set.size()+3u) / 4u;
1572  const uint32 n_blocks = (n_quads + BLOCKDIM-1)/BLOCKDIM;
1573 
1574  strided_packed_to_strided_uint8_kernel<BLOCKDIM> <<<n_blocks,BLOCKDIM>>>(
1575  in_string_set.size(),
1576  out_string_set.stride(),
1577  in_string_set,
1578  out_string_set.base_string(),
1579  out_string_set.lengths() );
1580  }
1581  else
1582  {
1583  const uint32 n_blocks = (in_string_set.size() + BLOCKDIM-1)/BLOCKDIM;
1584 
1585  generic_to_strided_kernel<BLOCKDIM> <<<n_blocks,BLOCKDIM>>>(
1586  in_string_set.size(),
1587  out_string_set.stride(),
1588  in_string_set,
1589  out_string_set.base_string(),
1590  out_string_set.lengths() );
1591  }
1592  cudaThreadSynchronize();
1593  }
1594  };
1595  // strided-packed input set
1596  template <
1597  typename SymbolType,
1598  uint32 SYMBOL_SIZE_T,
1599  uint32 BIG_ENDIAN_T,
1600  typename InLengthIterator>
1601  struct source_dispatch<
1602  StridedPackedStringSet<
1603  const uint32*,
1604  SymbolType,
1605  SYMBOL_SIZE_T,
1606  BIG_ENDIAN_T,
1607  InLengthIterator>
1608  >
1609  {
1610  typedef StridedPackedStringSet<
1611  const uint32*,
1612  SymbolType,
1613  SYMBOL_SIZE_T,
1614  BIG_ENDIAN_T,
1615  InLengthIterator>
1616  in_string_set_type;
1617 
1618  static void enact(
1619  const in_string_set_type& in_string_set,
1620  out_string_set_type& out_string_set)
1621  {
1622  const uint32 BLOCKDIM = 64u;
1623 
1624  if (out_string_set.size() != in_string_set.size() ||
1625  out_string_set.stride() < out_string_set.size())
1626  throw nvbio::runtime_error( "copy() : unmatched string set sizes" );
1627 
1628  // optimize if the stride is a multiple of 4 (i.e. if we can write out full 32-bit words)
1629  if ((out_string_set.stride() & 3) == 0)
1630  {
1631  const uint32 n_quads = (in_string_set.size()+3u) / 4u;
1632  const uint32 n_blocks = (n_quads + BLOCKDIM-1)/BLOCKDIM;
1633 
1634  strided_packed_to_strided_uint8_kernel<BLOCKDIM> <<<n_blocks,BLOCKDIM>>>(
1635  in_string_set.size(),
1636  out_string_set.stride(),
1637  in_string_set,
1638  out_string_set.base_string(),
1639  out_string_set.lengths() );
1640  }
1641  else
1642  {
1643  const uint32 n_blocks = (in_string_set.size() + BLOCKDIM-1)/BLOCKDIM;
1644 
1645  generic_to_strided_kernel<BLOCKDIM> <<<n_blocks,BLOCKDIM>>>(
1646  in_string_set.size(),
1647  out_string_set.stride(),
1648  in_string_set,
1649  out_string_set.base_string(),
1650  out_string_set.lengths() );
1651  }
1652  cudaThreadSynchronize();
1653  }
1654  };
1655 
1656  // packed-concat input set
1657  template <
1658  typename InStreamIterator,
1659  typename SymbolType,
1660  uint32 SYMBOL_SIZE_T,
1661  bool BIG_ENDIAN_T,
1662  typename InOffsetIterator>
1663  struct source_dispatch<
1664  ConcatenatedStringSet<
1665  PackedStream<InStreamIterator,SymbolType,SYMBOL_SIZE_T,BIG_ENDIAN_T>,
1666  InOffsetIterator>
1667  >
1668  {
1669  typedef ConcatenatedStringSet<
1670  PackedStream<InStreamIterator,SymbolType,SYMBOL_SIZE_T,BIG_ENDIAN_T>,
1671  InOffsetIterator> in_string_set_type;
1672 
1673  static void enact(
1674  const in_string_set_type& in_string_set,
1675  const out_string_set_type& out_string_set)
1676  {
1677  const uint32 BLOCKDIM = 64u;
1678 
1679  if (out_string_set.size() != in_string_set.size() ||
1680  out_string_set.stride() < out_string_set.size())
1681  throw nvbio::runtime_error( "copy() : unmatched string set sizes" );
1682 
1683  const uint32 n_blocks = (in_string_set.size() + BLOCKDIM-1)/BLOCKDIM;
1684 
1685  // get the base word stream of the input
1686  const InStreamIterator in_stream = in_string_set.base_string().stream();
1687 
1688  packed_concat_to_strided_kernel<
1689  BLOCKDIM,
1690  SYMBOL_SIZE_T,
1691  BIG_ENDIAN_T>
1692  <<<n_blocks,BLOCKDIM>>>(
1693  in_string_set.size(),
1694  out_string_set.stride(),
1695  in_stream,
1696  in_string_set.offsets(),
1697  out_string_set.base_string(),
1698  out_string_set.lengths() );
1699 
1700  cudaThreadSynchronize();
1701  }
1702  };
1703 
1704  // packed-sparse input set
1705  template <
1706  typename InStreamIterator,
1707  typename SymbolType,
1708  uint32 SYMBOL_SIZE_T,
1709  bool BIG_ENDIAN_T,
1710  typename InOffsetIterator>
1711  struct source_dispatch<
1712  SparseStringSet<
1713  PackedStream<InStreamIterator,SymbolType,SYMBOL_SIZE_T,BIG_ENDIAN_T>,
1714  InOffsetIterator>
1715  >
1716  {
1717  typedef SparseStringSet<
1718  PackedStream<InStreamIterator,SymbolType,SYMBOL_SIZE_T,BIG_ENDIAN_T>,
1719  InOffsetIterator> in_string_set_type;
1720 
1721  static void enact(
1722  const in_string_set_type& in_string_set,
1723  const out_string_set_type& out_string_set)
1724  {
1725  const uint32 BLOCKDIM = 64u;
1726 
1727  if (out_string_set.size() != in_string_set.size() ||
1728  out_string_set.stride() < out_string_set.size())
1729  throw nvbio::runtime_error( "copy() : unmatched string set sizes" );
1730 
1731  const uint32 n_blocks = (in_string_set.size() + BLOCKDIM-1)/BLOCKDIM;
1732 
1733  // get the base word stream of the input
1734  const InStreamIterator in_stream = in_string_set.base_string().stream();
1735 
1736  packed_sparse_to_strided_kernel<
1737  BLOCKDIM,
1738  SYMBOL_SIZE_T,
1739  BIG_ENDIAN_T>
1740  <<<n_blocks,BLOCKDIM>>>(
1741  in_string_set.size(),
1742  out_string_set.stride(),
1743  in_stream,
1744  in_string_set.ranges(),
1745  out_string_set.base_string(),
1746  out_string_set.lengths() );
1747 
1748  cudaThreadSynchronize();
1749  }
1750  };
1751 
1752  template <typename in_string_set_type>
1753  static void enact(
1754  const in_string_set_type& in_string_set,
1755  out_string_set_type& out_string_set)
1756  {
1757  return source_dispatch<in_string_set_type>::enact( in_string_set, out_string_set );
1758  }
1759 };
1760 
1761 
1762 //
1763 // strided-packed output string sets.
1764 //
1765 template <
1766  typename SymbolType,
1767  uint32 SYMBOL_SIZE_T,
1768  bool BIG_ENDIAN_T,
1769  typename OutStreamIterator,
1770  typename OutLengthIterator>
1771 struct copy_dispatch<
1772  StridedPackedStringSet<
1773  OutStreamIterator,SymbolType,SYMBOL_SIZE_T,BIG_ENDIAN_T,
1774  OutLengthIterator>
1775  >
1776 {
1777  typedef StridedPackedStringSet<
1778  OutStreamIterator,SymbolType,SYMBOL_SIZE_T,BIG_ENDIAN_T,
1779  OutLengthIterator> out_string_set_type;
1780 
1781  // generic input set
1782  template <typename in_string_set_type>
1783  struct source_dispatch
1784  {
1785  static void enact(
1786  const in_string_set_type& in_string_set,
1787  const out_string_set_type& out_string_set)
1788  {
1789  const uint32 BLOCKDIM = 64u;
1790 
1791  if (out_string_set.size() != in_string_set.size() ||
1792  out_string_set.stride() < out_string_set.size())
1793  throw nvbio::runtime_error( "copy() : unmatched string set sizes" );
1794 
1795  const uint32 n_blocks = (in_string_set.size() + BLOCKDIM-1)/BLOCKDIM;
1796 
1797  generic_to_strided_packed_kernel<
1798  BLOCKDIM,
1799  SYMBOL_SIZE_T,
1800  BIG_ENDIAN_T>
1801  <<<n_blocks,BLOCKDIM>>>(
1802  in_string_set.size(),
1803  out_string_set.stride(),
1804  in_string_set,
1805  out_string_set.base_stream(),
1806  out_string_set.lengths() );
1807 
1808  cudaThreadSynchronize();
1809  }
1810  };
1811 
1812  // concatenated input set
1813  template <
1814  typename InStreamIterator,
1815  typename InOffsetIterator>
1816  struct source_dispatch<
1817  ConcatenatedStringSet<
1818  InStreamIterator,
1819  InOffsetIterator>
1820  >
1821  {
1822  typedef ConcatenatedStringSet<
1823  InStreamIterator,
1824  InOffsetIterator> in_string_set_type;
1825 
1826  static void enact(
1827  const in_string_set_type& in_string_set,
1828  const out_string_set_type& out_string_set)
1829  {
1830  const uint32 BLOCKDIM = 128u;
1831 
1832  if (out_string_set.size() != in_string_set.size() ||
1833  out_string_set.stride() < out_string_set.size())
1834  throw nvbio::runtime_error( "copy() : unmatched string set sizes" );
1835 
1836  const uint32 n_blocks = (in_string_set.size() + BLOCKDIM-1)/BLOCKDIM;
1837 
1838  contig_to_strided_packed_kernel<
1839  BLOCKDIM,
1840  SYMBOL_SIZE_T,
1841  BIG_ENDIAN_T>
1842  <<<n_blocks,BLOCKDIM>>>(
1843  in_string_set.size(),
1844  out_string_set.stride(),
1845  in_string_set,
1846  out_string_set.base_stream(),
1847  out_string_set.lengths() );
1848 
1849  cudaThreadSynchronize();
1850  }
1851  };
1852 
1853  // sparse input set
1854  template <
1855  typename InStreamIterator,
1856  typename InOffsetIterator>
1857  struct source_dispatch<
1858  SparseStringSet<
1859  InStreamIterator,
1860  InOffsetIterator>
1861  >
1862  {
1863  typedef SparseStringSet<
1864  InStreamIterator,
1865  InOffsetIterator> in_string_set_type;
1866 
1867  static void enact(
1868  const in_string_set_type& in_string_set,
1869  const out_string_set_type& out_string_set)
1870  {
1871  const uint32 BLOCKDIM = 64u;
1872 
1873  if (out_string_set.size() != in_string_set.size() ||
1874  out_string_set.stride() < out_string_set.size())
1875  throw nvbio::runtime_error( "copy() : unmatched string set sizes" );
1876 
1877  const uint32 n_blocks = (in_string_set.size() + BLOCKDIM-1)/BLOCKDIM;
1878 
1879  contig_to_strided_packed_kernel<
1880  BLOCKDIM,
1881  SYMBOL_SIZE_T,
1882  BIG_ENDIAN_T>
1883  <<<n_blocks,BLOCKDIM>>>(
1884  in_string_set.size(),
1885  out_string_set.stride(),
1886  in_string_set,
1887  out_string_set.base_stream(),
1888  out_string_set.lengths() );
1889 
1890  cudaThreadSynchronize();
1891  }
1892  };
1893 
1894  // packed-concatenated input set
1895  template <
1896  typename InStreamIterator,
1897  typename InOffsetIterator>
1898  struct source_dispatch<
1899  ConcatenatedStringSet<
1900  PackedStream<InStreamIterator,SymbolType,SYMBOL_SIZE_T,BIG_ENDIAN_T>,
1901  InOffsetIterator>
1902  >
1903  {
1904  typedef ConcatenatedStringSet<
1905  PackedStream<InStreamIterator,SymbolType,SYMBOL_SIZE_T,BIG_ENDIAN_T>,
1906  InOffsetIterator> in_string_set_type;
1907 
1908  static void enact(
1909  const in_string_set_type& in_string_set,
1910  const out_string_set_type& out_string_set)
1911  {
1912  const uint32 BLOCKDIM = 64;
1913 
1914  if (out_string_set.size() != in_string_set.size() ||
1915  out_string_set.stride() < out_string_set.size())
1916  throw nvbio::runtime_error( "copy() : unmatched string set sizes" );
1917 
1918  const uint32 n_blocks = (in_string_set.size() + BLOCKDIM-1)/BLOCKDIM;
1919 
1920  // get the base word stream of the input
1921  const InStreamIterator in_stream = in_string_set.base_string().stream();
1922 
1923  packed_concatenated_to_strided_packed_kernel<
1924  BLOCKDIM,
1925  SYMBOL_SIZE_T,
1926  BIG_ENDIAN_T>
1927  <<<n_blocks,BLOCKDIM>>>(
1928  in_string_set.size(),
1929  out_string_set.stride(),
1930  in_stream,
1931  in_string_set.offsets(),
1932  out_string_set.base_stream(),
1933  out_string_set.lengths() );
1934 
1935  cudaThreadSynchronize();
1936  }
1937  };
1938 
1939  // packed-sparse input set
1940  template <
1941  typename InStreamIterator,
1942  typename InOffsetIterator>
1943  struct source_dispatch<
1944  SparseStringSet<
1945  PackedStream<InStreamIterator,SymbolType,SYMBOL_SIZE_T,BIG_ENDIAN_T>,
1946  InOffsetIterator>
1947  >
1948  {
1949  typedef SparseStringSet<
1950  PackedStream<InStreamIterator,SymbolType,SYMBOL_SIZE_T,BIG_ENDIAN_T>,
1951  InOffsetIterator> in_string_set_type;
1952 
1953  static void enact(
1954  const in_string_set_type& in_string_set,
1955  const out_string_set_type& out_string_set)
1956  {
1957  const uint32 BLOCKDIM = 64u;
1958 
1959  if (out_string_set.size() != in_string_set.size() ||
1960  out_string_set.stride() < out_string_set.size())
1961  throw nvbio::runtime_error( "copy() : unmatched string set sizes" );
1962 
1963  const uint32 n_blocks = (in_string_set.size() + BLOCKDIM-1)/BLOCKDIM;
1964 
1965  // get the base word stream of the input
1966  const InStreamIterator in_stream = in_string_set.base_string().stream();
1967 
1968  packed_sparse_to_strided_packed_kernel<
1969  BLOCKDIM,
1970  SYMBOL_SIZE_T,
1971  BIG_ENDIAN_T>
1972  <<<n_blocks,BLOCKDIM>>>(
1973  in_string_set.size(),
1974  out_string_set.stride(),
1975  in_stream,
1976  in_string_set.ranges(),
1977  out_string_set.base_stream(),
1978  out_string_set.lengths() );
1979 
1980  cudaThreadSynchronize();
1981  }
1982  };
1983 
1984  template <typename in_string_set_type>
1985  static void enact(
1986  const in_string_set_type& in_string_set,
1987  out_string_set_type& out_string_set)
1988  {
1989  return source_dispatch<in_string_set_type>::enact( in_string_set, out_string_set );
1990  }
1991 };
1992 
1993 // copy a generic string set into a concatenated one
1994 //
1995 // \param in_string_set input string set
1996 // \param out_string_set output string set
1997 //
1998 template <
1999  typename InStringSet,
2000  typename StringIterator,
2001  typename OffsetIterator>
2002 void copy(
2003  const InStringSet& in_string_set,
2004  ConcatenatedStringSet<StringIterator,OffsetIterator>& out_string_set)
2005 {
2006  typedef ConcatenatedStringSet<StringIterator,OffsetIterator> OutStringSet;
2007 
2008  copy_dispatch<OutStringSet>::enact( in_string_set, out_string_set );
2009 }
2010 
2011 // copy a generic string set into a strided one
2012 //
2013 // \param in_string_set input string set
2014 // \param out_string_set output string set
2015 //
2016 template <
2017  typename InStringSet,
2018  typename StringIterator,
2019  typename LengthIterator>
2020 void copy(
2021  const InStringSet& in_string_set,
2022  StridedStringSet<StringIterator,LengthIterator>& out_string_set)
2023 {
2024  typedef StridedStringSet<StringIterator,LengthIterator> OutStringSet;
2025 
2026  copy_dispatch<OutStringSet>::enact( in_string_set, out_string_set );
2027 }
2028 
2029 // copy a generic string set into a strided-packed one
2030 //
2031 // \param in_string_set input string set
2032 // \param out_string_set output string set
2033 //
2034 template <
2035  typename InStringSet,
2036  typename StreamIterator,
2037  typename SymbolType,
2038  uint32 SYMBOL_SIZE_T,
2039  bool BIG_ENDIAN_T,
2040  typename LengthIterator>
2041 void copy(
2042  const InStringSet& in_string_set,
2043  StridedPackedStringSet<StreamIterator,SymbolType,SYMBOL_SIZE_T,BIG_ENDIAN_T,LengthIterator>& out_string_set)
2044 {
2045  typedef StridedPackedStringSet<StreamIterator,SymbolType,SYMBOL_SIZE_T,BIG_ENDIAN_T,LengthIterator> OutStringSet;
2046 
2047  copy_dispatch<OutStringSet>::enact( in_string_set, out_string_set );
2048 }
2049 
2050 } // namespace cuda
2051 
2052 #endif // defined(__CUDACC__)
2053 
2054 template <typename out_string_set_type>
2056 {
2057  // generic input set
2058  template <typename in_string_set_type>
2060  {
2061  static void enact(
2062  const in_string_set_type& in_string_set,
2063  const out_string_set_type& out_string_set)
2064  {
2065  if (out_string_set.size() != in_string_set.size() ||
2066  out_string_set.stride() < out_string_set.size())
2067  throw nvbio::runtime_error( "copy() : unmatched string set sizes" );
2068 
2069  const uint32 n_strings = in_string_set.size();
2070 
2071  #if 1
2072  const uint32 BLOCK_SIZE = 16;
2073  for (uint32 i_block = 0; i_block < n_strings; i_block += BLOCK_SIZE)
2074  {
2075  const uint32 i_block_end = std::min( i_block + BLOCK_SIZE, n_strings );
2076 
2077  uint32 max_len = 0;
2078  for (uint32 i = i_block; i < i_block_end; ++i)
2079  max_len = std::max( max_len, in_string_set[i].length() );
2080 
2081  for (uint32 j_block = 0; j_block < max_len; j_block += BLOCK_SIZE)
2082  {
2083  for (uint32 i = i_block; i < i_block_end; ++i)
2084  {
2085  typename in_string_set_type::string_type in_string = in_string_set[i];
2086  typename out_string_set_type::string_type out_string = out_string_set[i];
2087 
2088  const uint32 m = in_string.length();
2089  const uint32 j_block_end = std::min( j_block + BLOCK_SIZE, m );
2090 
2091  for (uint32 j = j_block; j < j_block_end; ++j)
2092  out_string[j] = in_string[j];
2093  }
2094  }
2095  }
2096  #else
2097  for (uint32 i = 0; i < n_strings; ++i)
2098  {
2099  typename in_string_set_type::string_type in_string = in_string_set[i];
2100  typename out_string_set_type::string_type out_string = out_string_set[i];
2101 
2102  const uint32 m = in_string.length();
2103  for (uint32 j = 0; j < m; ++j)
2104  out_string[j] = in_string[j];
2105  }
2106  #endif
2107  }
2108  };
2109 
2110  template <typename in_string_set_type>
2111  static void enact(
2112  const in_string_set_type& in_string_set,
2113  out_string_set_type& out_string_set)
2114  {
2115  return source_dispatch<in_string_set_type>::enact( in_string_set, out_string_set );
2116  }
2117 };
2118 
2119 // copy a generic string set into a concatenated one
2120 //
2121 // \param in_string_set input string set
2122 // \param out_string_set output string set
2123 //
2124 template <
2125  typename InStringSet,
2126  typename StringIterator,
2127  typename OffsetIterator>
2128 void copy(
2129  const InStringSet& in_string_set,
2131 {
2133 
2134  copy_dispatch<OutStringSet>::enact( in_string_set, out_string_set );
2135 }
2136 
2137 // copy a generic string set into a strided one
2138 //
2139 // \param in_string_set input string set
2140 // \param out_string_set output string set
2141 //
2142 template <
2143  typename InStringSet,
2144  typename StringIterator,
2145  typename LengthIterator>
2146 void copy(
2147  const InStringSet& in_string_set,
2149 {
2151 
2152  copy_dispatch<OutStringSet>::enact( in_string_set, out_string_set );
2153 }
2154 
2155 // copy a generic string set into a strided-packed one
2156 //
2157 // \param in_string_set input string set
2158 // \param out_string_set output string set
2159 //
2160 template <
2161  typename InStringSet,
2162  typename StreamIterator,
2163  typename SymbolType,
2164  uint32 SYMBOL_SIZE_T,
2165  bool BIG_ENDIAN_T,
2166  typename LengthIterator>
2167 void copy(
2168  const InStringSet& in_string_set,
2170 {
2172 
2173  copy_dispatch<OutStringSet>::enact( in_string_set, out_string_set );
2174 }
2175 
2176 } // namespace nvbio