nvbio — batched_inl.h (batched alignment implementation, doxygen source listing)
1 /*
2  * nvbio
3  * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #pragma once
29 
30 #include <nvbio/alignment/utils.h>
32 #include <nvbio/basic/types.h>
36 #include <nvbio/basic/vector.h>
38 #if defined(_OPENMP)
39 #include <omp.h>
40 #endif
41 
42 namespace nvbio {
43 namespace aln {
44 
47 
// Score a single alignment problem fetched from an alignment stream.
//
// Loads the work item's context and strings from the stream, runs the scoring
// stage over the given DP column storage, and writes the result back through
// stream.output().
//
// \param stream     the alignment stream providing work items
// \param column     DP column storage owned by the executing thread
// \param work_id    index of the work item within the stream
// \param thread_id  index of the executing thread (unused here; the column is
//                   already bound to it by the caller)
template <typename stream_type, typename column_type>
NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
void batched_alignment_score(stream_type& stream, column_type column, const uint32 work_id, const uint32 thread_id)
{
    typedef typename stream_type::aligner_type aligner_type;
    typedef typename stream_type::context_type context_type;
    typedef typename stream_type::strings_type strings_type;

    // load the alignment context; a false return means there is no work to do,
    // but the stream still gets a chance to emit its output
    context_type context;
    if (stream.init_context( work_id, &context ) == false)
    {
        // handle the output
        stream.output( work_id, &context );
        return;
    }

    // compute the end of the current DP matrix window:
    // pattern-blocking aligners advance along the pattern, the others along the text
    const uint32 len = equal<typename aligner_type::algorithm_tag,PatternBlockingTag>() ?
        stream.pattern_length( work_id, &context ) :
        stream.text_length( work_id, &context );

    // load the strings to be aligned
    strings_type strings;
    stream.load_strings( work_id, 0, len, &context, &strings );

    // score the current DP matrix window
    // NOTE(review): the call head on the dropped source line is reconstructed
    // from the argument list that survived extraction — confirm against the
    // original nvbio source
    alignment_score(
        stream.aligner(),
        strings.pattern,
        strings.quals,
        strings.text,
        context.min_score,
        context.sink,
        column );

    // handle the output
    stream.output( work_id, &context );
}
87 
// Kernel scoring one alignment problem per thread, using a statically-sized
// local-memory DP column of COLUMN_SIZE cells.
//
// The global-memory `columns` / `stride` arguments are not used by this
// variant; they are presumably kept so it shares the launch interface of the
// global-memory kernels (the caller passes the same argument list to both).
template <uint32 BLOCKDIM, uint32 MINBLOCKS, uint32 COLUMN_SIZE, typename stream_type, typename cell_type>
__global__ void
__launch_bounds__(BLOCKDIM,MINBLOCKS)
lmem_batched_alignment_score_kernel(stream_type stream, cell_type* columns, const uint32 stride)
{
    // flat 1D indexing: one work item per thread, tail threads bail out
    const uint32 work_id = blockIdx.x * BLOCKDIM + threadIdx.x;
    if (work_id >= stream.size())
        return;

    // per-thread column storage, statically allocated (local memory)
    cell_type column[COLUMN_SIZE];

    batched_alignment_score( stream, column, work_id, work_id );
}
103 
// Kernel scoring one alignment problem per thread, with DP columns held in a
// global-memory arena: thread i owns cells columns[i], columns[i + stride], ...
// so that the lanes of a warp access consecutive addresses (coalesced).
template <uint32 BLOCKDIM, uint32 MINBLOCKS, typename stream_type, typename cell_type>
__global__ void
__launch_bounds__(BLOCKDIM,MINBLOCKS)
batched_alignment_score_kernel(stream_type stream, cell_type* columns, const uint32 stride)
{
    const uint32 tid = blockIdx.x * BLOCKDIM + threadIdx.x;

    if (tid >= stream.size())
        return;

    // fetch the proper column storage
    // NOTE(review): this typedef was dropped by the doc extractor and is
    // reconstructed from the identical pattern used by the traceback path
    typedef strided_iterator<cell_type*> column_type;
    column_type column = column_type( columns + tid, stride );

    batched_alignment_score( stream, column, tid, tid );
}
120 
// Persistent-threads variant of batched_alignment_score_kernel: the grid is
// sized to the available column storage rather than the stream, and each
// thread walks the stream at a grid-threads stride, reusing the column bound
// to its own thread index.
template <uint32 BLOCKDIM, uint32 MINBLOCKS, typename stream_type, typename cell_type>
__global__ void
__launch_bounds__(BLOCKDIM,MINBLOCKS)
persistent_batched_alignment_score_kernel(stream_type stream, cell_type* columns, const uint32 stride)
{
    const uint32 grid_threads = gridDim.x * BLOCKDIM;
    const uint32 thread_id    = threadIdx.x + blockIdx.x*BLOCKDIM;

    const uint32 stream_end = stream.size();

    // fetch the proper column storage (fixed per thread across iterations)
    // NOTE(review): this typedef was dropped by the doc extractor and is
    // reconstructed from the identical pattern used by the traceback path
    typedef strided_iterator<cell_type*> column_type;
    column_type column = column_type( columns + thread_id, stride );

    // let this CTA fetch all tiles at a grid-threads stride, starting from blockIdx.x*BLOCKDIM
    for (uint32 stream_begin = 0; stream_begin < stream_end; stream_begin += grid_threads)
    {
        const uint32 work_id = thread_id + stream_begin;

        if (work_id < stream_end)
            batched_alignment_score( stream, column, work_id, thread_id );
    }
}
144 
// Score a single alignment problem using one full warp per problem.
//
// All lanes cooperate in warp::alignment_score(); only lane 0 reports the
// best sink into the context and emits the output.
//
// \param stream    the alignment stream providing work items
// \param columns   global column arena
// \param stride    stride between successive cells of a warp's column
// \param work_id   index of the work item within the stream
// \param warp_id   index of the executing warp (selects its column storage)
template <uint32 BLOCKDIM, typename stream_type, typename cell_type>
NVBIO_FORCEINLINE NVBIO_DEVICE
void warp_batched_alignment_score(stream_type& stream, cell_type* columns, const uint32 stride, const uint32 work_id, const uint32 warp_id)
{
    typedef typename stream_type::aligner_type aligner_type;
    typedef typename stream_type::context_type context_type;
    typedef typename stream_type::strings_type strings_type;

    // load the alignment context; init_context() is assumed uniform across
    // the warp for a given work_id, so this early-out is warp-coherent
    context_type context;
    if (stream.init_context( work_id, &context ) == false)
    {
        if (warp_tid() == 0)
        {
            // handle the output
            stream.output( work_id, &context );
        }
        return;
    }

    // compute the end of the current DP matrix window
    const uint32 len = equal<typename aligner_type::algorithm_tag,PatternBlockingTag>() ?
        stream.pattern_length( work_id, &context ) :
        stream.text_length( work_id, &context );

    // load the strings to be aligned
    strings_type strings;
    stream.load_strings( work_id, 0, len, &context, &strings );

    // fetch the proper column storage: each warp owns WARP_SIZE consecutive
    // cells per column row, strided by `stride`
    // NOTE(review): this typedef was dropped by the doc extractor and is
    // reconstructed from the pattern used by the sibling kernels
    typedef strided_iterator<cell_type*> column_type;
    column_type column = column_type( columns + warp_id * cuda::Arch::WARP_SIZE, stride );

    // score the current DP matrix window cooperatively across the warp
    uint2 sink;
    const int32 score = warp::alignment_score<BLOCKDIM>(
        stream.aligner(),
        strings.pattern,
        strings.quals,
        strings.text,
        context.min_score,
        &sink,
        column );

    if (warp_tid() == 0)
    {
        context.sink.report( score, sink );

        // handle the output
        stream.output( work_id, &context );
    }
}
197 
// Kernel mapping one alignment problem to each warp: the warp index doubles
// as both the work-item index and the column-storage index.
template <uint32 BLOCKDIM, typename stream_type, typename cell_type>
__global__ void warp_batched_alignment_score_kernel(stream_type stream, cell_type* columns, const uint32 stride)
{
    // this thread's warp index within the grid
    const uint32 warp_id = (blockIdx.x * BLOCKDIM + threadIdx.x) >> cuda::Arch::LOG_WARP_SIZE;

    if (warp_id < stream.size())
        warp_batched_alignment_score<BLOCKDIM>( stream, columns, stride, warp_id, warp_id );
}
208 
// Persistent-warps variant: each warp walks the work stream at a grid-warps
// stride, always reusing the column storage bound to its own warp index.
template <uint32 BLOCKDIM, typename stream_type, typename cell_type>
__global__ void warp_persistent_batched_alignment_score_kernel(stream_type stream, cell_type* columns, const uint32 stride)
{
    // total warps in the grid, and this thread's warp index
    const uint32 n_grid_warps = (gridDim.x * BLOCKDIM) >> cuda::Arch::LOG_WARP_SIZE;
    const uint32 warp_id      = (threadIdx.x + blockIdx.x*BLOCKDIM) >> cuda::Arch::LOG_WARP_SIZE;

    const uint32 n_work = stream.size();

    // let each warp fetch work items at a grid-warps stride
    for (uint32 work_id = warp_id; work_id < n_work; work_id += n_grid_warps)
        warp_batched_alignment_score<BLOCKDIM>( stream, columns, stride, work_id, warp_id );
}
221 
223 
226 
230 
236 template <typename stream_type>
238 {
239  static const uint32 MAX_THREADS = 128; // whatever CPU we have, we assume we are never going to have more than this number of threads
240 
241  typedef typename stream_type::aligner_type aligner_type;
243 
246  static uint32 column_storage(const uint32 max_pattern_len, const uint32 max_text_len)
247  {
248  const uint32 column_size = equal<typename aligner_type::algorithm_tag,PatternBlockingTag>() ?
249  uint32( max_text_len * sizeof(cell_type) ) :
250  uint32( max_pattern_len * sizeof(cell_type) );
251 
252  return align<4>( column_size );
253  }
254 
257  static uint64 min_temp_storage(const uint32 max_pattern_len, const uint32 max_text_len, const uint32 stream_size);
258 
261  static uint64 max_temp_storage(const uint32 max_pattern_len, const uint32 max_text_len, const uint32 stream_size);
262 
265  void enact(stream_type stream, uint64 temp_size = 0u, uint8* temp = NULL);
266 };
267 
268 // return the minimum number of bytes required by the algorithm
269 //
270 template <typename stream_type>
272 {
273  return column_storage( max_pattern_len, max_text_len ) * MAX_THREADS;
274 }
275 
276 // return the maximum number of bytes required by the algorithm
277 //
278 template <typename stream_type>
280 {
281  return column_storage( max_pattern_len, max_text_len ) * MAX_THREADS;
282 }
283 
284 // enact the batch execution
285 //
286 template <typename stream_type>
288 {
289  const uint32 column_size = equal<typename aligner_type::algorithm_tag,PatternBlockingTag>() ?
290  uint32( stream.max_pattern_length() ) :
291  uint32( stream.max_text_length() );
292 
293  const uint64 min_temp_size = min_temp_storage(
294  stream.max_pattern_length(),
295  stream.max_text_length(),
296  stream.size() );
297 
298  nvbio::vector<host_tag,uint8> temp_vec( min_temp_size );
299  cell_type* columns = (cell_type*)nvbio::raw_pointer( temp_vec );
300 
301  #if defined(_OPENMP)
302  #pragma omp parallel for
303  #endif
304  for (int work_id = 0; work_id < int( stream.size() ); ++work_id)
305  {
306  #if defined(_OPENMP)
307  const uint32 thread_id = omp_get_thread_num();
308  #else
309  const uint32 thread_id = 0;
310  #endif
311 
312  // fetch the proper column storage
313  //typedef strided_iterator<cell_type*> column_type;
314  //column_type column = column_type( columns + thread_id, queue_capacity );
315 
316  // for the CPU it might be better to keep column storage contiguous
317  cell_type* column = columns + thread_id * column_size;
318 
319  // and solve the actual alignment problem
320  batched_alignment_score( stream, column, work_id, thread_id );
321  }
322 }
323 
329 template <uint32 BLOCKDIM, uint32 MINBLOCKS, typename stream_type>
331 {
332  typedef typename stream_type::aligner_type aligner_type;
334 
337  static uint32 column_storage(const uint32 max_pattern_len, const uint32 max_text_len)
338  {
339  const uint32 column_size = equal<typename aligner_type::algorithm_tag,PatternBlockingTag>() ?
340  uint32( max_text_len * sizeof(cell_type) ) :
341  uint32( max_pattern_len * sizeof(cell_type) );
342 
343  return align<4>( column_size );
344  }
345 
348  static uint64 min_temp_storage(const uint32 max_pattern_len, const uint32 max_text_len, const uint32 stream_size);
349 
352  static uint64 max_temp_storage(const uint32 max_pattern_len, const uint32 max_text_len, const uint32 stream_size);
353 
356  void enact(stream_type stream, uint64 temp_size = 0u, uint8* temp = NULL);
357 };
358 
359 // return the minimum number of bytes required by the algorithm
360 //
361 template <uint32 BLOCKDIM, uint32 MINBLOCKS, typename stream_type>
363 {
364  return column_storage( max_pattern_len, max_text_len ) * 1024;
365 }
366 
367 // return the maximum number of bytes required by the algorithm
368 //
369 template <uint32 BLOCKDIM, uint32 MINBLOCKS, typename stream_type>
371 {
372  return align<32>( column_storage( max_pattern_len, max_text_len ) * stream_size );
373 }
374 
375 // enact the batch execution
376 //
377 template <uint32 BLOCKDIM, uint32 MINBLOCKS, typename stream_type>
379 {
380  const uint32 column_size = equal<typename aligner_type::algorithm_tag,PatternBlockingTag>() ?
381  uint32( stream.max_text_length() ) :
382  uint32( stream.max_pattern_length() );
383 
384  // if the column is small, let's just use statically allocated local memory
385  if (column_size <= 1024)
386  {
387  const uint32 n_blocks = (stream.size() + BLOCKDIM-1) / BLOCKDIM;
388 
389  lmem_batched_alignment_score_kernel<BLOCKDIM,MINBLOCKS,1024> <<<n_blocks, BLOCKDIM>>>(
390  stream,
391  (cell_type*)temp,
392  stream.size() );
393  }
394  else
395  {
396  // the column is large to be allocated in local memory, let's use global memory
397  const uint64 min_temp_size = min_temp_storage(
398  stream.max_pattern_length(),
399  stream.max_text_length(),
400  stream.size() );
401 
403  if (temp == NULL)
404  {
405  temp_size = nvbio::max( min_temp_size, temp_size );
406  temp_vec.resize( temp_size );
407  temp = nvbio::plain_view( temp_vec );
408  }
409 
410  // set the queue capacity based on available memory
411  const uint32 queue_capacity = uint32( temp_size / column_storage( stream.max_pattern_length(), stream.max_text_length() ) );
412 
413  if (queue_capacity >= stream.size())
414  {
415  const uint32 n_blocks = (stream.size() + BLOCKDIM-1) / BLOCKDIM;
416 
417  batched_alignment_score_kernel<BLOCKDIM,MINBLOCKS> <<<n_blocks, BLOCKDIM>>>(
418  stream,
419  (cell_type*)temp,
420  stream.size() );
421  }
422  else
423  {
424  // compute the number of blocks we are going to launch
425  const uint32 n_blocks = nvbio::max( nvbio::min(
426  (uint32)cuda::max_active_blocks( persistent_batched_alignment_score_kernel<BLOCKDIM,MINBLOCKS,stream_type,cell_type>, BLOCKDIM, 0u ),
427  queue_capacity / BLOCKDIM ), 1u );
428 
429  persistent_batched_alignment_score_kernel<BLOCKDIM,MINBLOCKS> <<<n_blocks, BLOCKDIM>>>(
430  stream,
431  (cell_type*)temp,
432  queue_capacity );
433  }
434  }
435 }
436 
442 template <typename stream_type>
444 {
445  static const uint32 BLOCKDIM = 128;
446 
447  typedef typename stream_type::aligner_type aligner_type;
449 
452  static uint64 min_temp_storage(const uint32 max_pattern_len, const uint32 max_text_len, const uint32 stream_size);
453 
456  static uint64 max_temp_storage(const uint32 max_pattern_len, const uint32 max_text_len, const uint32 stream_size);
457 
460  void enact(stream_type stream, uint64 temp_size = 0u, uint8* temp = NULL);
461 };
462 
463 // return the minimum number of bytes required by the algorithm
464 //
465 template <typename stream_type>
467 {
468  return max_text_len * sizeof(cell_type) * 1024;
469 }
470 
471 // return the maximum number of bytes required by the algorithm
472 //
473 template <typename stream_type>
475 {
476  return max_text_len * sizeof(cell_type) * stream_size;
477 }
478 
479 // enact the batch execution
480 //
481 template <typename stream_type>
483 {
484  const uint64 min_temp_size = min_temp_storage(
485  stream.max_pattern_length(),
486  stream.max_text_length(),
487  stream.size() );
488 
490  if (temp == NULL)
491  {
492  temp_size = nvbio::max( min_temp_size, temp_size );
493  temp_vec.resize( temp_size );
494  temp = nvbio::plain_view( temp_vec );
495  }
496 
497  NVBIO_VAR_UNUSED static const uint32 WARP_SIZE = cuda::Arch::WARP_SIZE;
498 
499  // set the queue capacity based on available memory
500  const uint32 queue_capacity = align_down<WARP_SIZE>( uint32( temp_size / (align<WARP_SIZE>( stream.max_text_length() ) * sizeof(cell_type)) ) );
501 
502  const uint32 BLOCKWARPS = BLOCKDIM >> cuda::Arch::LOG_WARP_SIZE;
503  if (queue_capacity >= stream.size())
504  {
505  const uint32 n_warps = stream.size();
506  const uint32 n_blocks = (n_warps + BLOCKWARPS-1) / BLOCKWARPS;
507 
508  warp_batched_alignment_score_kernel<BLOCKDIM> <<<n_blocks, BLOCKDIM>>>(
509  stream,
510  (cell_type*)temp,
511  align<WARP_SIZE>( stream.size() ) ); // make sure everything is properly aligned
512  }
513  else
514  {
515  // compute the number of blocks we are going to launch
516  const uint32 n_blocks = nvbio::max( nvbio::min(
517  (uint32)cuda::max_active_blocks( warp_persistent_batched_alignment_score_kernel<BLOCKDIM,stream_type,cell_type>, BLOCKDIM, 0u ),
518  queue_capacity / BLOCKDIM ), 1u );
519 
520  warp_persistent_batched_alignment_score_kernel<BLOCKDIM> <<<n_blocks, BLOCKDIM>>>(
521  stream,
522  (cell_type*)temp,
523  queue_capacity );
524  }
525 }
526 
532 template <typename stream_type>
534 {
535  static const uint32 BLOCKDIM = 128;
536 
537  typedef typename stream_type::aligner_type aligner_type;
539 
542  static uint32 column_storage(const uint32 max_pattern_len, const uint32 max_text_len)
543  {
544  const uint32 column_size = equal<typename aligner_type::algorithm_tag,PatternBlockingTag>() ?
545  uint32( max_text_len * sizeof(cell_type) ) :
546  uint32( max_pattern_len * sizeof(cell_type) );
547 
548  return align<4>( column_size );
549  }
550 
553  static uint64 min_temp_storage(const uint32 max_pattern_len, const uint32 max_text_len, const uint32 stream_size)
554  {
555  return column_storage( max_pattern_len, max_text_len ) * 1024;
556  }
557 
560  static uint64 max_temp_storage(const uint32 max_pattern_len, const uint32 max_text_len, const uint32 stream_size)
561  {
562  return column_storage( max_pattern_len, max_text_len ) * stream_size;
563  }
564 
567  void enact(stream_type stream, uint64 temp_size = 0u, uint8* temp = NULL)
568  {
569  const uint64 min_temp_size = min_temp_storage(
570  stream.max_pattern_length(),
571  stream.max_text_length(),
572  stream.size() );
573 
575  if (temp == NULL)
576  {
577  temp_size = nvbio::max( min_temp_size, temp_size );
578  temp_vec.resize( temp_size );
579  temp = nvbio::plain_view( temp_vec );
580  }
581 
582  // set the queue capacity based on available memory
583  const uint32 max_pattern_len = stream.max_pattern_length();
584  const uint32 max_text_len = stream.max_text_length();
585  const uint32 queue_capacity = uint32( temp_size / column_storage( max_pattern_len, max_text_len ) );
586 
587  m_work_queue.set_capacity( queue_capacity );
588 
589  // prepare the work stream
590  ScoreStream<stream_type> score_stream(
591  stream,
592  temp,
593  NULL,
594  queue_capacity );
595 
596  // consume the work stream
597  m_work_queue.consume( score_stream );
598  }
599 
600 private:
604  BLOCKDIM> m_work_queue;
605 };
606 
607 // --- Traceback --------------------------------------------------------------------------------------------------------- //
608 
611 
// Perform scoring plus traceback for a single alignment problem.
//
// Uses three per-thread storage areas carved out of global memory, all
// strided by `stride` for coalesced access: DP checkpoints (one row every
// CHECKPOINTS pattern rows), a packed direction-vector submatrix, and the
// DP column.
//
// \param stream       the alignment stream providing work items
// \param checkpoints  checkpoint arena
// \param submatrices  packed traceback-submatrix arena
// \param columns      DP column arena
// \param stride       stride between successive cells of a thread's storage
// \param work_id      index of the work item within the stream
// \param thread_id    index of the executing thread (selects its storage)
template <uint32 CHECKPOINTS, typename stream_type, typename cell_type>
NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
void batched_alignment_traceback(stream_type& stream, cell_type* checkpoints, uint32* submatrices, cell_type* columns, const uint32 stride, const uint32 work_id, const uint32 thread_id)
{
    typedef typename stream_type::aligner_type aligner_type;
    typedef typename stream_type::context_type context_type;
    typedef typename stream_type::strings_type strings_type;

    // load the alignment context
    context_type context;
    if (stream.init_context( work_id, &context ) == false)
    {
        // handle the output
        stream.output( work_id, &context );
        return;
    }

    // compute the end of the current DP matrix window
    const uint32 pattern_len = stream.pattern_length( work_id, &context );

    // load the strings to be aligned
    strings_type strings;
    stream.load_strings( work_id, 0, pattern_len, &context, &strings );

    // fetch the proper checkpoint storage
    typedef strided_iterator<cell_type*> checkpoint_type;
    checkpoint_type checkpoint = checkpoint_type( checkpoints + thread_id, stride );

    // fetch the proper submatrix storage
    typedef strided_iterator<uint32*> submatrix_storage_type;
    submatrix_storage_type submatrix_storage = submatrix_storage_type( submatrices + thread_id, stride );
    // NOTE(review): BITS reconstructed from the dropped source line — it is the
    // aligner's per-cell direction-vector bit width; confirm against the
    // original nvbio source
    const uint32 BITS = direction_vector_traits<aligner_type>::BITS;
    PackedStream<submatrix_storage_type,uint8,BITS,false> submatrix( submatrix_storage );

    // fetch the proper column storage
    typedef strided_iterator<cell_type*> column_type;
    column_type column = column_type( columns + thread_id, stride );

    // run scoring + traceback over the current DP matrix window
    context.alignment = alignment_traceback<CHECKPOINTS>(
        stream.aligner(),
        strings.pattern,
        strings.quals,
        strings.text,
        context.min_score,
        context.backtracer,
        checkpoint,
        submatrix,
        column );

    // handle the output
    stream.output( work_id, &context );
}
665 
// Kernel running one traceback problem per thread over strided global-memory
// checkpoint / submatrix / column arenas.
template <uint32 BLOCKDIM, uint32 MINBLOCKS, uint32 CHECKPOINTS, typename stream_type, typename cell_type>
__global__ void
__launch_bounds__(BLOCKDIM,MINBLOCKS)
batched_alignment_traceback_kernel(stream_type stream, cell_type* checkpoints, uint32* submatrices, cell_type* columns, const uint32 stride)
{
    const uint32 tid = blockIdx.x * BLOCKDIM + threadIdx.x;

    if (tid >= stream.size())
        return;

    // NOTE(review): the dispatch call was dropped by the doc extractor and is
    // reconstructed to match the persistent variant below (work_id == thread_id
    // in the non-persistent mapping)
    batched_alignment_traceback<CHECKPOINTS>( stream, checkpoints, submatrices, columns, stride, tid, tid );
}
678 
// Persistent-threads variant of batched_alignment_traceback_kernel: each
// thread walks the stream at a grid-threads stride, reusing the storage
// bound to its own thread index.
template <uint32 BLOCKDIM, uint32 MINBLOCKS, uint32 CHECKPOINTS, typename stream_type, typename cell_type>
__global__ void
__launch_bounds__(BLOCKDIM,MINBLOCKS)
persistent_batched_alignment_traceback_kernel(stream_type stream, cell_type* checkpoints, uint32* submatrices, cell_type* columns, const uint32 stride)
{
    const uint32 grid_threads = gridDim.x * BLOCKDIM;
    const uint32 thread_id    = threadIdx.x + blockIdx.x*BLOCKDIM;

    const uint32 stream_end = stream.size();

    // let this CTA fetch all tiles at a grid-threads stride, starting from blockIdx.x*BLOCKDIM
    for (uint32 stream_begin = 0; stream_begin < stream_end; stream_begin += grid_threads)
    {
        const uint32 work_id = thread_id + stream_begin;

        if (work_id < stream_end)
        {
            // NOTE(review): the dispatch call was dropped by the doc extractor
            // and is reconstructed from the scoring-path persistent kernel
            batched_alignment_traceback<CHECKPOINTS>( stream, checkpoints, submatrices, columns, stride, work_id, thread_id );
        }
    }
}
698 
700 
706 template <uint32 BLOCKDIM, uint32 MINBLOCKS, uint32 CHECKPOINTS, typename stream_type>
707 struct BatchedAlignmentTraceback<CHECKPOINTS, stream_type,DeviceThreadBlockScheduler<BLOCKDIM,MINBLOCKS> >
708 {
709  typedef typename stream_type::aligner_type aligner_type;
711 
714  static uint32 column_storage(const uint32 max_pattern_len, const uint32 max_text_len)
715  {
716  const uint32 column_size = equal<typename aligner_type::algorithm_tag,PatternBlockingTag>() ?
717  uint32( max_text_len * sizeof(cell_type) ) :
718  uint32( max_pattern_len * sizeof(cell_type) );
719 
720  return align<4>( column_size );
721  }
722 
725  static uint32 checkpoint_storage(const uint32 max_pattern_len, const uint32 max_text_len)
726  {
727  if (equal<typename aligner_type::algorithm_tag,PatternBlockingTag>())
728  return align<4>( uint32( max_text_len * ((max_pattern_len + CHECKPOINTS-1) / CHECKPOINTS) * sizeof(cell_type) ) );
729  else
730  return align<4>( uint32( max_pattern_len * ((max_text_len + CHECKPOINTS-1) / CHECKPOINTS) * sizeof(cell_type) ) );
731  }
732 
735  static uint32 submatrix_storage(const uint32 max_pattern_len, const uint32 max_text_len)
736  {
737  if (equal<typename aligner_type::algorithm_tag,PatternBlockingTag>())
738  {
739  typedef typename stream_type::aligner_type aligner_type;
741  const uint32 ELEMENTS_PER_WORD = 32 / BITS;
742  return ((max_text_len * CHECKPOINTS + ELEMENTS_PER_WORD-1) / ELEMENTS_PER_WORD) * sizeof(uint32);
743  }
744  else
745  {
746  typedef typename stream_type::aligner_type aligner_type;
748  const uint32 ELEMENTS_PER_WORD = 32 / BITS;
749  return ((max_pattern_len * CHECKPOINTS + ELEMENTS_PER_WORD-1) / ELEMENTS_PER_WORD) * sizeof(uint32);
750  }
751  }
752 
755  static uint32 element_storage(const uint32 max_pattern_len, const uint32 max_text_len)
756  {
757  return column_storage( max_pattern_len, max_text_len ) +
758  checkpoint_storage( max_pattern_len, max_text_len ) +
759  submatrix_storage( max_pattern_len, max_text_len );
760  }
761 
764  static uint64 min_temp_storage(const uint32 max_pattern_len, const uint32 max_text_len, const uint32 stream_size);
765 
768  static uint64 max_temp_storage(const uint32 max_pattern_len, const uint32 max_text_len, const uint32 stream_size);
769 
772  void enact(stream_type stream, uint64 temp_size = 0u, uint8* temp = NULL);
773 };
774 
775 // return the minimum number of bytes required by the algorithm
776 //
777 template <uint32 BLOCKDIM, uint32 MINBLOCKS, uint32 CHECKPOINTS, typename stream_type>
779 {
780  return element_storage( max_pattern_len, max_text_len ) * 1024;
781 }
782 
783 // return the maximum number of bytes required by the algorithm
784 //
785 template <uint32 BLOCKDIM, uint32 MINBLOCKS, uint32 CHECKPOINTS, typename stream_type>
787 {
788  return element_storage( max_pattern_len, max_text_len ) * stream_size;
789 }
790 
791 // enact the batch execution
792 //
793 template <uint32 BLOCKDIM, uint32 MINBLOCKS, uint32 CHECKPOINTS, typename stream_type>
795 {
796  const uint64 min_temp_size = min_temp_storage(
797  stream.max_pattern_length(),
798  stream.max_text_length(),
799  stream.size() );
800 
802  if (temp == NULL)
803  {
804  temp_size = nvbio::max( min_temp_size, temp_size );
805  temp_vec.resize( temp_size );
806  temp = nvbio::plain_view( temp_vec );
807  }
808 
809  // set the queue capacity based on available memory
810  const uint32 max_pattern_len = stream.max_pattern_length();
811  const uint32 max_text_len = stream.max_text_length();
812  const uint32 queue_capacity = uint32( temp_size / element_storage( max_pattern_len, max_text_len ) );
813 
814  const uint64 column_size = column_storage( max_pattern_len, max_text_len );
815  const uint64 checkpoints_size = checkpoint_storage( max_pattern_len, max_text_len );
816 
817  if (queue_capacity >= stream.size())
818  {
819  const uint32 n_blocks = (stream.size() + BLOCKDIM-1) / BLOCKDIM;
820 
821  cell_type* checkpoints = (cell_type*)(temp);
822  cell_type* columns = (cell_type*)(temp + (checkpoints_size) * stream.size());
823  uint32* submatrices = (uint32*) (temp + (checkpoints_size + column_size) * stream.size());
824 
825  batched_alignment_traceback_kernel<BLOCKDIM,MINBLOCKS,CHECKPOINTS> <<<n_blocks, BLOCKDIM>>>(
826  stream,
827  checkpoints,
828  submatrices,
829  columns,
830  stream.size() );
831  }
832  else
833  {
834  // compute the number of blocks we are going to launch
835  const uint32 n_blocks = nvbio::max( nvbio::min(
836  (uint32)cuda::max_active_blocks( persistent_batched_alignment_traceback_kernel<BLOCKDIM,MINBLOCKS,CHECKPOINTS,stream_type,cell_type>, BLOCKDIM, 0u ),
837  queue_capacity / BLOCKDIM ), 1u );
838 
839  cell_type* checkpoints = (cell_type*)(temp);
840  cell_type* columns = (cell_type*)(temp + (checkpoints_size) * queue_capacity);
841  uint32* submatrices = (uint32*) (temp + (checkpoints_size + column_size) * queue_capacity);
842 
843  persistent_batched_alignment_traceback_kernel<BLOCKDIM,MINBLOCKS,CHECKPOINTS> <<<n_blocks, BLOCKDIM>>>(
844  stream,
845  checkpoints,
846  submatrices,
847  columns,
848  queue_capacity );
849  }
850 }
851 
852 namespace priv {
853 
854 //
855 // An alignment stream class to be used in conjunction with the BatchAlignmentScore class
856 //
857 template <
858  typename t_aligner_type,
859  typename pattern_set_type,
860  typename qualities_set_type,
861  typename text_set_type,
862  typename sink_iterator>
864 {
865  typedef t_aligner_type aligner_type;
866 
867  typedef typename pattern_set_type::string_type input_pattern_string;
868  typedef typename text_set_type::string_type input_text_string;
873  typedef typename qualities_set_type::string_type quals_string;
874  typedef typename std::iterator_traits<sink_iterator>::value_type sink_type;
875 
876  // an alignment context
878  {
881  };
882  // a container for the strings to be aligned
884  {
887 
891  };
892 
893  // constructor
896  aligner_type _aligner,
897  const uint32 _count,
898  const pattern_set_type _patterns,
899  const qualities_set_type _quals,
900  const text_set_type _texts,
901  sink_iterator _sinks,
902  const uint32 _max_pattern_length,
903  const uint32 _max_text_length) :
904  m_aligner ( _aligner ),
905  m_count (_count),
906  m_patterns (_patterns),
907  m_quals (_quals),
908  m_texts (_texts),
909  m_sinks (_sinks),
910  m_max_pattern_length ( _max_pattern_length ),
911  m_max_text_length ( _max_text_length ) {}
912 
913  // get the aligner
915  const aligner_type& aligner() const { return m_aligner; };
916 
917  // return the maximum pattern length
920 
921  // return the maximum text length
924 
925  // return the stream size
927  uint32 size() const { return m_count; }
928 
929  // return the i-th pattern's length
931  uint32 pattern_length(const uint32 i, context_type* context) const { return m_patterns[i].length(); }
932 
933  // return the i-th text's length
935  uint32 text_length(const uint32 i, context_type* context) const { return m_texts[i].length(); }
936 
937  // initialize the i-th context
940  const uint32 i,
941  context_type* context) const
942  {
943  // initialize the sink
944  context->sink = sink_type();
945 
946  context->min_score = Field_traits<int32>::min();
947  return true;
948  }
949 
950  // initialize the i-th context
953  const uint32 i,
954  const uint32 window_begin,
955  const uint32 window_end,
956  const context_type* context,
957  strings_type* strings) const
958  {
959  strings->pattern = strings->pattern_prefetcher.load( m_patterns[i] );
960  strings->quals = m_quals[i];
961  strings->text = strings->text_prefetcher.load( m_texts[i] );
962  }
963 
964  // handle the output
966  void output(
967  const uint32 i,
968  const context_type* context) const
969  {
970  // copy the sink
971  m_sinks[i] = context->sink;
972  }
973 
976  pattern_set_type m_patterns;
977  qualities_set_type m_quals;
978  text_set_type m_texts;
979  sink_iterator m_sinks;
982 };
983 
984 } // namespace priv
985 
986 //
987 // A convenience function for aligning a batch of patterns to a corresponding batch of texts.
988 //
989 template <
990  typename aligner_type,
991  typename pattern_set_type,
992  typename text_set_type,
993  typename sink_iterator,
994  typename scheduler_type>
996  const aligner_type aligner,
997  const pattern_set_type patterns,
998  const text_set_type texts,
999  sink_iterator sinks,
1000  const scheduler_type scheduler,
1001  const uint32 max_pattern_length,
1002  const uint32 max_text_length)
1003 {
1005 
1006  typedef aln::BatchedAlignmentScore<stream_type, scheduler_type> batch_type; // our batch type
1007 
1008  // create the stream
1009  stream_type stream(
1010  aligner,
1011  patterns.size(),
1012  patterns,
1014  texts,
1015  sinks,
1016  max_pattern_length,
1017  max_text_length );
1018 
1019  // enact the batch
1020  batch_type batch;
1021  batch.enact( stream );
1022 }
1023 
1024 //
1025 // A convenience function for aligning a batch of patterns to a corresponding batch of texts.
1026 //
1027 template <
1028  typename aligner_type,
1029  typename pattern_set_type,
1030  typename qualities_set_type,
1031  typename text_set_type,
1032  typename sink_iterator,
1033  typename scheduler_type>
1035  const aligner_type aligner,
1036  const pattern_set_type patterns,
1037  const qualities_set_type quals,
1038  const text_set_type texts,
1039  sink_iterator sinks,
1040  const scheduler_type scheduler,
1041  const uint32 max_pattern_length,
1042  const uint32 max_text_length)
1043 {
1045 
1046  typedef aln::BatchedAlignmentScore<stream_type, scheduler_type> batch_type; // our batch type
1047 
1048  // create the stream
1049  stream_type stream(
1050  aligner,
1051  patterns.size(),
1052  patterns,
1053  quals,
1054  texts,
1055  sinks,
1056  max_pattern_length,
1057  max_text_length );
1058 
1059  // enact the batch
1060  batch_type batch;
1061  batch.enact( stream );
1062 }
1063 
1064 //
1065 // A convenience function for aligning a batch of patterns to a corresponding batch of texts.
1066 //
1067 template <
1068  uint32 BAND_LEN,
1069  typename aligner_type,
1070  typename pattern_set_type,
1071  typename text_set_type,
1072  typename sink_iterator,
1073  typename scheduler_type>
1075  const aligner_type aligner,
1076  const pattern_set_type patterns,
1077  const text_set_type texts,
1078  sink_iterator sinks,
1079  const scheduler_type scheduler,
1080  const uint32 max_pattern_length,
1081  const uint32 max_text_length)
1082 {
1084 
1086 
1087  // create the stream
1088  stream_type stream(
1089  aligner,
1090  patterns.size(),
1091  patterns,
1093  texts,
1094  sinks,
1095  max_pattern_length,
1096  max_text_length );
1097 
1098  // enact the batch
1099  batch_type batch;
1100  batch.enact( stream );
1101 }
1102 
1104 
1106 
1107 } // namespace aln
1108 } // namespace nvbio