batched_banded_inl.h
/*
 * nvbio
 * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *    * Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *    * Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *    * Neither the name of the NVIDIA CORPORATION nor the
 *      names of its contributors may be used to endorse or promote products
 *      derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#pragma once

#include <nvbio/basic/types.h>
#include <nvbio/basic/strided_iterator.h>
#include <nvbio/basic/packedstream.h>
#include <nvbio/basic/cuda/work_queue.h>
#include <nvbio/alignment/utils.h>
#include <thrust/device_vector.h>

namespace nvbio {
namespace aln {

template <uint32 BAND_LEN, typename stream_type>
NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
void batched_banded_alignment_score(const stream_type& stream, const uint32 work_id)
{
    typedef typename stream_type::aligner_type aligner_type;
    typedef typename stream_type::context_type context_type;
    typedef typename stream_type::strings_type strings_type;

    // load the alignment context
    context_type context;
    if (stream.init_context( work_id, &context ) == true)
    {
        // compute the end of the current DP matrix window
        const uint32 len = equal<typename aligner_type::algorithm_tag,TextBlockingTag>() ?
            stream.text_length( work_id, &context ) :
            stream.pattern_length( work_id, &context );

        // load the strings to be aligned
        strings_type strings;
        stream.load_strings( work_id, 0, len, &context, &strings );

        // score the current DP matrix window
        banded_alignment_score<BAND_LEN>(
            stream.aligner(),
            strings.pattern,
            strings.quals,
            strings.text,
            context.min_score,
            context.sink );
    }

    // handle the output
    stream.output( work_id, &context );
}
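
///
/// The routine above drives a user-supplied stream object through a small
/// interface: init_context(), pattern_length() / text_length(),
/// load_strings(), aligner() and output(). The skeleton below sketches such a
/// stream; it is illustrative only (MyStream, MyAligner, MyContext and
/// MyStrings are hypothetical names), with member signatures inferred from
/// the calls above:
///
/// \code
/// struct MyStream
/// {
///     typedef MyAligner  aligner_type;
///     typedef MyContext  context_type;
///     typedef MyStrings  strings_type;
///
///     uint32 size() const;                                           // number of alignment jobs
///     bool   init_context(const uint32 i, context_type* ctx) const;  // return false to skip job i
///     uint32 pattern_length(const uint32 i, context_type* ctx) const;
///     uint32 text_length(const uint32 i, context_type* ctx) const;
///     void   load_strings(const uint32 i, const uint32 begin, const uint32 end,
///                         const context_type* ctx, strings_type* strings) const;
///     aligner_type aligner() const;
///     void   output(const uint32 i, const context_type* ctx) const;  // write back the sink / alignment
///     uint32 max_pattern_length() const;
///     uint32 max_text_length() const;
/// };
/// \endcode
///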

template <uint32 BLOCKDIM, uint32 MINBLOCKS, uint32 BAND_LEN, typename stream_type>
__global__ void
__launch_bounds__(BLOCKDIM,MINBLOCKS)
batched_banded_alignment_score_kernel(const stream_type stream)
{
    const uint32 tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= stream.size())
        return;

    batched_banded_alignment_score<BAND_LEN>( stream, tid );
}

///
/// HostThreadScheduler specialization of BatchedBandedAlignmentScore.
///
template <uint32 BAND_LEN, typename stream_type>
struct BatchedBandedAlignmentScore<BAND_LEN,stream_type,HostThreadScheduler>
{
    static const uint32 BLOCKDIM = 128;

    typedef typename stream_type::aligner_type aligner_type;

    // return the minimum number of bytes required by the algorithm
    static uint64 min_temp_storage(const uint32 max_pattern_len, const uint32 max_text_len, const uint32 stream_size) { return 0u; }

    // return the maximum number of bytes required by the algorithm
    static uint64 max_temp_storage(const uint32 max_pattern_len, const uint32 max_text_len, const uint32 stream_size) { return 0u; }

    // enact the batch execution
    void enact(stream_type stream, uint64 temp_size = 0u, uint8* temp = NULL);
};

// enact the batch execution
//
template <uint32 BAND_LEN, typename stream_type>
void BatchedBandedAlignmentScore<BAND_LEN,stream_type,HostThreadScheduler>::enact(stream_type stream, uint64 temp_size, uint8* temp)
{
    #if defined(_OPENMP)
    #pragma omp parallel for
    #endif
    for (int tid = 0; tid < int( stream.size() ); ++tid)
        batched_banded_alignment_score<BAND_LEN>( stream, tid );
}
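
///
/// Example: scoring a whole batch on the host with the specialization above
/// (an illustrative sketch; my_stream is a hypothetical object implementing
/// the stream interface sketched earlier, here with a band of 15 diagonals):
///
/// \code
/// MyStream my_stream = ...;
///
/// aln::BatchedBandedAlignmentScore<15u, MyStream, aln::HostThreadScheduler> batch;
///
/// batch.enact( my_stream );   // no temporary storage needed: min/max_temp_storage() return 0
/// \endcode
///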

///
/// DeviceThreadBlockScheduler specialization of BatchedBandedAlignmentScore.
///
template <uint32 BLOCKDIM, uint32 MINBLOCKS, uint32 BAND_LEN, typename stream_type>
struct BatchedBandedAlignmentScore<BAND_LEN,stream_type,DeviceThreadBlockScheduler<BLOCKDIM,MINBLOCKS> >
{
    typedef typename stream_type::aligner_type aligner_type;

    // return the minimum number of bytes required by the algorithm
    static uint64 min_temp_storage(const uint32 max_pattern_len, const uint32 max_text_len, const uint32 stream_size) { return 0u; }

    // return the maximum number of bytes required by the algorithm
    static uint64 max_temp_storage(const uint32 max_pattern_len, const uint32 max_text_len, const uint32 stream_size) { return 0u; }

    // enact the batch execution
    void enact(stream_type stream, uint64 temp_size = 0u, uint8* temp = NULL);
};

// enact the batch execution
//
template <uint32 BLOCKDIM, uint32 MINBLOCKS, uint32 BAND_LEN, typename stream_type>
void BatchedBandedAlignmentScore<BAND_LEN,stream_type,DeviceThreadBlockScheduler<BLOCKDIM,MINBLOCKS> >::enact(stream_type stream, uint64 temp_size, uint8* temp)
{
    const uint32 n_blocks = (stream.size() + BLOCKDIM-1) / BLOCKDIM;

    batched_banded_alignment_score_kernel<BLOCKDIM,MINBLOCKS,BAND_LEN> <<<n_blocks, BLOCKDIM>>>( stream );
}
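
///
/// Example: the same batch scored on the device, one CUDA thread per
/// alignment (an illustrative sketch; the stream must be accessible from
/// device code):
///
/// \code
/// aln::BatchedBandedAlignmentScore<
///     15u, MyStream, aln::DeviceThreadBlockScheduler<128,1> > batch;
///
/// batch.enact( my_stream );   // launches ceil(size/128) blocks of 128 threads
/// \endcode
///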

///
/// DeviceStagedThreadScheduler specialization of BatchedBandedAlignmentScore,
/// scheduling the jobs through a persistent work-queue.
///
template <uint32 BAND_LEN, typename stream_type>
struct BatchedBandedAlignmentScore<BAND_LEN,stream_type,DeviceStagedThreadScheduler>
{
    static const uint32 BLOCKDIM = 128;

    typedef typename stream_type::aligner_type               aligner_type;
    typedef typename column_storage_type<aligner_type>::type cell_type;

    // return the per-element band storage, in bytes
    static uint32 column_storage(const uint32 max_pattern_len, const uint32 max_text_len)
    {
        const uint32 column_size = uint32( BAND_LEN * sizeof(cell_type) );

        return align<4>( column_size );
    }

    // return the minimum number of bytes required by the algorithm
    static uint64 min_temp_storage(const uint32 max_pattern_len, const uint32 max_text_len, const uint32 stream_size)
    {
        return column_storage( max_pattern_len, max_text_len ) * 1024;
    }

    // return the maximum number of bytes required by the algorithm
    static uint64 max_temp_storage(const uint32 max_pattern_len, const uint32 max_text_len, const uint32 stream_size)
    {
        return column_storage( max_pattern_len, max_text_len ) * stream_size;
    }

    // enact the batch execution
    void enact(stream_type stream, uint64 temp_size = 0u, uint8* temp = NULL)
    {
        const uint64 min_temp_size = min_temp_storage(
            stream.max_pattern_length(),
            stream.max_text_length(),
            stream.size() );

        thrust::device_vector<uint8> temp_dvec;
        if (temp == NULL)
        {
            temp_size = nvbio::max( min_temp_size, temp_size );
            temp_dvec.resize( temp_size );
            temp = nvbio::device_view( temp_dvec );
        }

        // set the queue capacity based on available memory
        const uint32 max_pattern_len = stream.max_pattern_length();
        const uint32 max_text_len    = stream.max_text_length();
        const uint32 queue_capacity  = uint32( temp_size / column_storage( max_pattern_len, max_text_len ) );

        m_work_queue.set_capacity( queue_capacity );

        // prepare the work stream
        ScoreStream<stream_type> score_stream(
            stream,             // the alignments stream
            temp,               // band storage
            NULL,               // no need for checkpoints
            queue_capacity );   // the queue capacity, used for the memory striding

        // consume the work stream
        m_work_queue.consume( score_stream );
    }

private:
    cuda::WorkQueue<
        cuda::PersistentWarpsQueueTag,
        typename ScoreStream<stream_type>::unit_type,
        BLOCKDIM> m_work_queue;
};
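
///
/// The temporary buffer used above is carved into per-job band columns of
/// column_storage() bytes each, interleaved at a stride equal to the queue
/// capacity so that consecutive threads touch consecutive words. A small
/// worked example of the sizing (illustrative; a 2-byte cell type is assumed):
///
/// \code
/// // BAND_LEN = 31, sizeof(cell_type) = 2  =>  column_storage = align<4>(62) = 64 bytes
/// // temp_size = 64 MB                     =>  queue_capacity = 64 MB / 64   = 1M jobs in flight
/// //
/// // cell j of the column bound to queue slot i lives at cell_base[i + j * queue_capacity]
/// \endcode
///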

// --- Banded Traceback --------------------------------------------------------------------------------------------------------- //

template <uint32 BAND_LEN, uint32 CHECKPOINTS, typename stream_type, typename cell_type>
NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
void batched_banded_alignment_traceback(
    const stream_type&  stream,
    cell_type*          checkpoints,
    uint32*             submatrices,
    const uint32        stride,
    const uint32        work_id,
    const uint32        thread_id)
{
    typedef typename stream_type::aligner_type aligner_type;
    typedef typename stream_type::context_type context_type;
    typedef typename stream_type::strings_type strings_type;

    // load the alignment context
    context_type context;
    if (stream.init_context( work_id, &context ) == false)
    {
        // handle the output
        stream.output( work_id, &context );
        return;
    }

    // compute the end of the current DP matrix window
    const uint32 len = equal<typename aligner_type::algorithm_tag,PatternBlockingTag>() ?
        stream.pattern_length( work_id, &context ) :
        stream.text_length( work_id, &context );

    // load the strings to be aligned
    strings_type strings;
    stream.load_strings( work_id, 0, len, &context, &strings );

    // fetch the proper checkpoint storage
    typedef strided_iterator<cell_type*> checkpoint_type;
    checkpoint_type checkpoint = checkpoint_type( checkpoints + thread_id, stride );

    // fetch the proper submatrix storage
    typedef strided_iterator<uint32*> submatrix_storage_type;
    submatrix_storage_type submatrix_storage = submatrix_storage_type( submatrices + thread_id, stride );
    const uint32 BITS = direction_vector_traits<aligner_type>::BITS;
    PackedStream<submatrix_storage_type,uint8,BITS,false> submatrix( submatrix_storage );

    // traceback the current DP matrix window
    context.alignment = banded_alignment_traceback<BAND_LEN, CHECKPOINTS>(
        stream.aligner(),
        strings.pattern,
        strings.quals,
        strings.text,
        context.min_score,
        context.backtracer,
        checkpoint,
        submatrix );

    // handle the output
    stream.output( work_id, &context );
}
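
///
/// Both the checkpoint cells and the packed direction-vector words above are
/// interleaved across threads: element k of thread t lives at
/// base[t + k*stride], so that when a warp steps to its k-th element the
/// accesses coalesce into contiguous transactions. For example (illustrative):
///
/// \code
/// // stride = 4 threads, per-thread elements A,B,C are laid out in memory as:
/// //
/// //   t0.A t1.A t2.A t3.A | t0.B t1.B t2.B t3.B | t0.C t1.C t2.C t3.C
/// \endcode
///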

template <uint32 BLOCKDIM, uint32 BAND_LEN, uint32 CHECKPOINTS, typename stream_type, typename cell_type>
__global__ void batched_banded_alignment_traceback_kernel(const stream_type stream, cell_type* checkpoints, uint32* submatrices, const uint32 stride)
{
    const uint32 tid = blockIdx.x * BLOCKDIM + threadIdx.x;

    if (tid >= stream.size())
        return;

    batched_banded_alignment_traceback<BAND_LEN, CHECKPOINTS>( stream, checkpoints, submatrices, stride, tid, tid );
}
309 
310 template <uint32 BLOCKDIM, uint32 BAND_LEN, uint32 CHECKPOINTS, typename stream_type, typename cell_type>
312 {
313  const uint32 grid_threads = gridDim.x * BLOCKDIM;
314  const uint32 thread_id = threadIdx.x + blockIdx.x*BLOCKDIM;
315 
316  const uint32 stream_end = stream.size();
317 
318  // let this CTA fetch all tiles at a grid-threads stride, starting from blockIdx.x*BLOCKDIM
319  for (uint32 stream_begin = 0; stream_begin < stream_end; stream_begin += grid_threads)
320  {
321  const uint32 work_id = thread_id + stream_begin;
322 
323  if (work_id < stream_end)
324  batched_banded_alignment_traceback<BAND_LEN, CHECKPOINTS>( stream, checkpoints, submatrices, stride, work_id, thread_id );
325  }
326 }
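
///
/// A small worked example of the persistent-kernel schedule above
/// (illustrative numbers):
///
/// \code
/// // BLOCKDIM = 128, n_blocks = 2  =>  grid_threads = 256
/// // thread_id 133 processes work items 133, 133+256, 133+512, ...
/// // and re-uses the same storage slot (the 'thread_id' argument) each round,
/// // so only grid_threads checkpoint/submatrix slots are ever needed
/// \endcode
///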

///
/// DeviceThreadScheduler specialization of BatchedBandedAlignmentTraceback.
///
template <uint32 BAND_LEN, uint32 CHECKPOINTS, typename stream_type>
struct BatchedBandedAlignmentTraceback<BAND_LEN,CHECKPOINTS,stream_type,DeviceThreadScheduler>
{
    static const uint32 BLOCKDIM = 128;

    typedef typename stream_type::aligner_type                   aligner_type;
    typedef typename checkpoint_storage_type<aligner_type>::type cell_type;

    // return the per-element checkpoint storage, in bytes
    static uint32 checkpoint_storage(const uint32 max_pattern_len, const uint32 max_text_len)
    {
        return align<4>( uint32( BAND_LEN * ((max_pattern_len + CHECKPOINTS-1) / CHECKPOINTS) * sizeof(cell_type) ) );
    }

    // return the per-element submatrix storage, in bytes
    static uint32 submatrix_storage(const uint32 max_pattern_len, const uint32 max_text_len)
    {
        typedef typename stream_type::aligner_type aligner_type;
        const uint32 BITS = direction_vector_traits<aligner_type>::BITS;
        const uint32 ELEMENTS_PER_WORD = 32 / BITS;
        return ((BAND_LEN * CHECKPOINTS + ELEMENTS_PER_WORD-1) / ELEMENTS_PER_WORD) * sizeof(uint32);
    }

    // return the per-element total storage, in bytes
    static uint32 element_storage(const uint32 max_pattern_len, const uint32 max_text_len)
    {
        return checkpoint_storage( max_pattern_len, max_text_len ) +
               submatrix_storage( max_pattern_len, max_text_len );
    }

    // return the minimum number of bytes required by the algorithm
    static uint64 min_temp_storage(const uint32 max_pattern_len, const uint32 max_text_len, const uint32 stream_size);

    // return the maximum number of bytes required by the algorithm
    static uint64 max_temp_storage(const uint32 max_pattern_len, const uint32 max_text_len, const uint32 stream_size);

    // enact the batch execution
    void enact(stream_type stream, uint64 temp_size = 0u, uint8* temp = NULL);
};

// return the minimum number of bytes required by the algorithm
//
template <uint32 BAND_LEN, uint32 CHECKPOINTS, typename stream_type>
uint64 BatchedBandedAlignmentTraceback<BAND_LEN,CHECKPOINTS,stream_type,DeviceThreadScheduler>::min_temp_storage(const uint32 max_pattern_len, const uint32 max_text_len, const uint32 stream_size)
{
    return element_storage( max_pattern_len, max_text_len ) * 1024;
}

// return the maximum number of bytes required by the algorithm
//
template <uint32 BAND_LEN, uint32 CHECKPOINTS, typename stream_type>
uint64 BatchedBandedAlignmentTraceback<BAND_LEN,CHECKPOINTS,stream_type,DeviceThreadScheduler>::max_temp_storage(const uint32 max_pattern_len, const uint32 max_text_len, const uint32 stream_size)
{
    return element_storage( max_pattern_len, max_text_len ) * stream_size;
}
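
///
/// A worked example of the storage bounds above (illustrative; it assumes a
/// 2-byte checkpoint cell and 2-bit direction vectors):
///
/// \code
/// // BAND_LEN = 31, CHECKPOINTS = 16, max_pattern_len = 100:
/// //
/// //   checkpoint_storage = align<4>( 31 * ceil(100/16) * 2 )    = 436 bytes
/// //   submatrix_storage  = ceil( 31*16 / 16 ) * sizeof(uint32)  = 124 bytes
/// //   element_storage    = 436 + 124                            = 560 bytes
/// //
/// //   min_temp_storage   = 560 * 1024        = 573,440 bytes (~0.5 MB)
/// //   max_temp_storage   = 560 * stream_size
/// \endcode
///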

// enact the batch execution
//
template <uint32 BAND_LEN, uint32 CHECKPOINTS, typename stream_type>
void BatchedBandedAlignmentTraceback<BAND_LEN,CHECKPOINTS,stream_type,DeviceThreadScheduler>::enact(stream_type stream, uint64 temp_size, uint8* temp)
{
    const uint64 min_temp_size = min_temp_storage(
        stream.max_pattern_length(),
        stream.max_text_length(),
        stream.size() );

    thrust::device_vector<uint8> temp_dvec;
    if (temp_size == 0u)
    {
        temp_dvec.resize( min_temp_size );
        temp      = nvbio::device_view( temp_dvec );
        temp_size = min_temp_size;
    }

    // set the queue capacity based on available memory
    const uint32 max_pattern_len = stream.max_pattern_length();
    const uint32 max_text_len    = stream.max_text_length();
    const uint32 queue_capacity  = uint32( temp_size / element_storage( max_pattern_len, max_text_len ) );

    const uint64 checkpoints_size = checkpoint_storage( max_pattern_len, max_text_len );

    if (queue_capacity >= stream.size())
    {
        const uint32 n_blocks = (stream.size() + BLOCKDIM-1) / BLOCKDIM;

        cell_type* checkpoints = (cell_type*)(temp);
        uint32*    submatrices = (uint32*)  (temp + checkpoints_size * stream.size());

        batched_banded_alignment_traceback_kernel<BLOCKDIM,BAND_LEN,CHECKPOINTS> <<<n_blocks, BLOCKDIM>>>(
            stream,
            checkpoints,
            submatrices,
            stream.size() );
    }
    else
    {
        // compute the number of blocks we are going to launch
        const uint32 n_blocks = nvbio::max( nvbio::min(
            (uint32)cuda::max_active_blocks( persistent_banded_batched_alignment_traceback_kernel<BLOCKDIM,BAND_LEN,CHECKPOINTS,stream_type,cell_type>, BLOCKDIM, 0u ),
            queue_capacity / BLOCKDIM ), 1u );

        cell_type* checkpoints = (cell_type*)(temp);
        uint32*    submatrices = (uint32*)  (temp + checkpoints_size * queue_capacity);

        persistent_banded_batched_alignment_traceback_kernel<BLOCKDIM,BAND_LEN,CHECKPOINTS> <<<n_blocks, BLOCKDIM>>>(
            stream,
            checkpoints,
            submatrices,
            queue_capacity );
    }
}
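
///
/// Example: running a traceback batch with the specialization above (an
/// illustrative sketch; my_stream is a hypothetical device-side stream whose
/// contexts carry a min_score, a backtracer and an output alignment field):
///
/// \code
/// typedef aln::BatchedBandedAlignmentTraceback<
///     15u,            // band length
///     16u,            // checkpoint spacing
///     MyStream,
///     aln::DeviceThreadScheduler> batch_type;
///
/// batch_type batch;
/// batch.enact( my_stream );   // allocates min_temp_storage() internally if no temp is given
/// \endcode
///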

} // namespace aln
} // namespace nvbio