NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
alignment_inl.h
Go to the documentation of this file.
1 /*
2  * nvbio
3  * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #pragma once
29 
30 #include <nvbio/basic/types.h>
31 #include <nvbio/alignment/utils.h>
37 
38 #if defined(__CUDACC__)
42 #endif
43 
44 namespace nvbio {
45 namespace aln {
46 
47 // a function to compute the alignment score between a pattern and a text string
48 // with full DP alignment.
49 //
50 // \param aligner alignment algorithm
51 // \param pattern pattern string
52 // \param quals quality string
53 // \param text text string
54 // \param min_score threshold alignment score
55 // \param sink output sink
56 //
57 template <
58  typename aligner_type,
59  typename pattern_string,
60  typename qual_string,
61  typename text_string,
62  typename sink_type,
63  typename column_type>
66  const aligner_type aligner,
67  const pattern_string pattern,
68  const qual_string quals,
69  const text_string text,
70  const int32 min_score,
71  sink_type& sink,
73 {
75  return dispatcher::dispatch(
76  aligner,
77  pattern,
78  quals,
79  text,
80  min_score,
81  sink,
82  column );
83 }
84 
85 // a function to compute the alignment score between a pattern and a text string
86 // with full DP alignment.
87 //
88 // \param aligner alignment algorithm
89 // \param pattern pattern string
90 // \param quals quality string
91 // \param text text string
92 // \param min_score threshold alignment score
93 // \param sink output sink
94 //
95 template <
96  uint32 MAX_TEXT_LEN,
97  typename aligner_type,
98  typename pattern_string,
99  typename qual_string,
100  typename text_string,
101  typename sink_type>
104  const aligner_type aligner,
105  const pattern_string pattern,
106  const qual_string quals,
107  const text_string text,
108  const int32 min_score,
109  sink_type& sink)
110 {
111  typedef typename column_storage_type<aligner_type>::type cell_type;
112 
113  cell_type column[MAX_TEXT_LEN];
114 
116  return dispatcher::dispatch(
117  aligner,
118  pattern,
119  quals,
120  text,
121  min_score,
122  sink,
123  column );
124 }
125 
126 // a function to compute the alignment score between a pattern and a text string
127 // with full DP alignment in multiple passes, where each pass handles a window
128 // of the pattern and saves a checkpoint for the next.
129 //
130 // \param aligner alignment algorithm
131 // \param pattern pattern string
132 // \param quals quality string
133 // \param text text string
134 // \param min_score threshold alignment score
135 // \param sink output sink
136 //
137 template <
138  typename aligner_type,
139  typename pattern_string,
140  typename qual_string,
141  typename text_string,
142  typename sink_type,
143  typename column_type,
144  typename checkpoint_type>
147  const aligner_type aligner,
148  const pattern_string pattern,
149  const qual_string quals,
150  const text_string text,
151  const int32 min_score,
152  const uint32 window_begin,
153  const uint32 window_end,
154  sink_type& sink,
155  checkpoint_type checkpoint,
157 {
159  return dispatcher::dispatch(
160  aligner,
161  pattern,
162  quals,
163  text,
164  min_score,
165  window_begin,
166  window_end,
167  sink,
168  checkpoint,
169  column );
170 }
171 
172 // a function to compute the alignment score between a pattern and a text string
173 // with full DP alignment in multiple passes, where each pass handles a window
174 // of the pattern and saves a checkpoint for the next.
175 // The checkpoint is stored in the column itself.
176 //
177 // \param aligner alignment algorithm
178 // \param pattern pattern string
179 // \param quals quality string
180 // \param text text string
181 // \param min_score threshold alignment score
182 // \param sink output sink
183 //
184 template <
185  typename aligner_type,
186  typename pattern_string,
187  typename qual_string,
188  typename text_string,
189  typename sink_type,
190  typename column_type>
193  const aligner_type aligner,
194  const pattern_string pattern,
195  const qual_string quals,
196  const text_string text,
197  const int32 min_score,
198  const uint32 window_begin,
199  const uint32 window_end,
200  sink_type& sink,
202 {
204  return dispatcher::dispatch(
205  aligner,
206  pattern,
207  quals,
208  text,
209  min_score,
210  window_begin,
211  window_end,
212  sink,
213  column );
214 }
215 
216 //
217 // Calculate a set of checkpoints of the DP matrix for the alignment between a pattern
218 // and a text.
219 //
220 // \param aligner scoring scheme
221 // \param pattern pattern string (horizontal
222 // \param quals pattern qualities
223 // \param text text string (vertical)
224 // \param min_score minimum score
225 // \param sink output sink
226 // \param checkpoints the set of output checkpointed rows
227 //
228 template <
229  uint32 CHECKPOINTS,
230  typename aligner_type,
231  typename pattern_string,
232  typename qual_string,
233  typename text_string,
234  typename sink_type,
235  typename checkpoint_type,
236  typename column_type>
239  const aligner_type aligner,
240  const pattern_string pattern,
241  const qual_string quals,
242  const text_string text,
243  const int32 min_score,
244  sink_type& sink,
245  checkpoint_type checkpoints,
247 {
249  dispatcher::dispatch_checkpoints(
250  aligner,
251  pattern,
252  quals,
253  text,
254  min_score,
255  sink,
256  checkpoints,
257  column );
258 }
259 
260 //
261 // Compute the banded Dynamic Programming submatrix between two given checkpoints,
262 // storing its flow at each cell.
263 // The function returns the submatrix width.
264 //
265 // \tparam BAND_LEN size of the DP band
266 //
267 // \tparam CHECKPOINTS number of DP rows between each checkpoint
268 //
269 // \tparam checkpoint_type a class to represent the collection of checkpoints,
270 // represented as a linear array storing each checkpointed
271 // band contiguously.
272 // The class has to provide the const indexing operator[].
273 //
274 // \tparam submatrix_type a class to store the flow submatrix, represented
275 // as a linear array of size (BAND_LEN*CHECKPOINTS).
276 // The class has to provide the non-const indexing operator[].
277 // Note that the submatrix entries can assume only 3 values,
278 // and could hence be packed in 2 bits.
279 //
280 // \param aligner scoring scheme
281 // \param pattern pattern string (horizontal
282 // \param quals pattern qualities
283 // \param text text string (vertical)
284 // \param min_score minimum score
285 // \param checkpoints the set of checkpointed rows
286 // \param checkpoint_id the starting checkpoint used as the beginning of the submatrix
287 // \param submatrix the output submatrix
288 //
289 // \return the submatrix width
290 //
291 template <
292  uint32 CHECKPOINTS,
293  typename aligner_type,
294  typename pattern_string,
295  typename qual_string,
296  typename text_string,
297  typename column_type,
298  typename checkpoint_type,
299  typename submatrix_type>
302  const aligner_type aligner,
303  const pattern_string pattern,
304  const qual_string quals,
305  const text_string text,
306  const int32 min_score,
307  checkpoint_type checkpoints,
308  const uint32 checkpoint_id,
309  submatrix_type submatrix,
311 {
313  return dispatcher::dispatch_submatrix(
314  aligner,
315  pattern,
316  quals,
317  text,
318  min_score,
319  checkpoints,
320  checkpoint_id,
321  submatrix,
322  column );
323 }
324 
325 //
326 // Backtrace an optimal alignment using a full DP algorithm.
327 //
328 // \tparam CHECKPOINTS number of DP rows between each checkpoint
329 // \tparam aligner_type an \ref Aligner "Aligner" algorithm
330 // \tparam pattern_string a string representing the pattern.
331 // \tparam qual_string an array representing the pattern qualities.
332 // \tparam text_string a string representing the text.
333 // \tparam checkpoints_type an array-like class defining operator[], used to represent a reduced DP score matrix,
334 // containing all the matrix columns whose index is a multiple of CHECKPOINTS;
335 // the type of the matrix cells depends on the aligner, and can be obtained as
336 // typename checkpoints_storage_type<aligner_type>::type
337 // \tparam submatrix_type an array-like class defining operator[], used to represent a temporary DP flow submatrix,
338 // containing all the matrix flow cells between two checkpointed columns.
339 // the number of bits needed for the submatrix cells depends on the aligner, and can be obtained as
340 // direction_vector_traits<aligner_type>::BITS
341 // \tparam backtracer_type a model of \ref Backtracer.
342 //
343 // \param aligner alignment algorithm
344 // \param pattern pattern to be aligned
345 // \param quals pattern quality scores
346 // \param text text to align the pattern to
347 // \param min_score minimum accepted score
348 // \param backtracer backtracking delegate
349 // \param checkpoints temporary checkpoints storage
350 // \param submatrix temporary submatrix storage
351 //
352 // \return reported alignment
353 //
354 template <
355  uint32 CHECKPOINTS,
356  typename aligner_type,
357  typename pattern_string,
358  typename qual_string,
359  typename text_string,
360  typename backtracer_type,
361  typename checkpoints_type,
362  typename submatrix_type,
363  typename column_type>
366  const aligner_type aligner,
367  const pattern_string pattern,
368  const qual_string quals,
369  const text_string text,
370  const int32 min_score,
371  backtracer_type& backtracer,
372  checkpoints_type checkpoints,
373  submatrix_type submatrix,
375 {
376  //
377  // This function performs backtracing in a completely generic fashion that works
378  // for any DP based aligner.
379  // It does so using 2 module-wide generic functions:
380  // * alignment_score_checkpoints()
381  // * alignment_score_submatrix()
382  //
383  // and one aligner-specific function:
384  // * priv::alignment_traceback()
385  //
386 
387  //
388  // TODO: use the following strategy: perform two scoring passes, one to find the end,
389  // and one (backwards, reversing both sequences) to find the beginning of the alignment
390  // (if the traceback function was passed an exact end cell, the first pass could be skipped).
391  // At that point one can isolate a much a smaller submatrix in which to perform checkpointing,
392  // and the following algorithm can be employed.
393  //
394 
395  typedef typename pattern_string::value_type symbol_type;
396 
397  // the submatrix height is equal to the text length (remember the DP matrix has the pattern as rows and the text as columns)
398  const uint32 submatrix_height = text.length();
399 
400  // compute a set of checkpoints along the pattern
401  BestSink<int32> best;
402  alignment_score_checkpoints<CHECKPOINTS>(
403  aligner, pattern, quals, text, min_score, best, checkpoints, column );
404 
405  const uint32 n_checkpoints = (pattern.length() + CHECKPOINTS-1)/CHECKPOINTS;
406 
407  // check whether we found a valid alignment
408  uint2 sink = best.sink;
409  if (sink.x == uint32(-1) ||
410  sink.y == uint32(-1))
411  return Alignment<int32>( best.score, make_uint2( uint32(-1), uint32(-1) ), make_uint2( uint32(-1), uint32(-1) ) );
412 
413  // clip the end of the pattern, in case the alignment terminated early
414  backtracer.clip( pattern.length() - sink.y );
415 
416  // find the checkpoint containing the sink
417  int32 checkpoint_id = n_checkpoints-1;
418 
419  if (aligner_type::TYPE == LOCAL)
420  {
421  for (; checkpoint_id >= 0; --checkpoint_id)
422  {
423  if (checkpoint_id * CHECKPOINTS < sink.y)
424  break;
425  }
426  }
427 
428  //store state (H, E, or F) between checkpoints
429  uint8 state = HSTATE;
430 
431  // backtrack until needed
432  for (; checkpoint_id >= 0; --checkpoint_id)
433  {
434  const uint32 submatrix_width = alignment_score_submatrix<CHECKPOINTS>(
435  aligner, pattern, quals, text, min_score, checkpoints, checkpoint_id, submatrix, column );
436 
437  if (priv::alignment_traceback<CHECKPOINTS>(
438  aligner, checkpoints, checkpoint_id, submatrix, submatrix_width, submatrix_height, state, sink, backtracer ))
439  break;
440  }
441 
442  // finish backtracking along the first row and first column (not explicitly stored)
443  // until we get to a cell containing the value zero.
445  {
446  if (sink.x == 0)
447  {
448  for (; sink.y > 0; --sink.y)
449  backtracer.push( INSERTION );
450  }
451  }
452  if (aligner_type::TYPE == GLOBAL)
453  {
454  if (sink.y == 0)
455  {
456  for (; sink.x > 0; --sink.x)
457  backtracer.push( DELETION );
458  }
459  }
460 
461  // clip the beginning of the alignment
462  backtracer.clip( sink.y );
463 
464  return Alignment<int32>( best.score, sink, best.sink );
465 }
466 
467 //
468 // Backtrace an optimal alignment using a full DP algorithm.
469 //
470 // \tparam MAX_PATTERN_LEN maximum pattern length
471 // \tparam MAX_TEXT_LEN maximum text length
472 // \tparam CHECKPOINTS number of DP rows between each checkpoint
473 // \tparam aligner_type an \ref Aligner "Aligner" algorithm
474 // \tparam pattern_string a string representing the pattern.
475 // \tparam qual_string an array representing the pattern qualities.
476 // \tparam text_string a string representing the text.
477 // \tparam backtracer_type a model of \ref Backtracer.
478 //
479 // \param aligner alignment algorithm
480 // \param pattern pattern to be aligned
481 // \param quals pattern quality scores
482 // \param text text to align the pattern to
483 // \param min_score minimum accepted score
484 // \param backtracer backtracking delegate
485 //
486 // \return reported alignment
487 //
488 template <
489  uint32 MAX_PATTERN_LEN,
490  uint32 MAX_TEXT_LEN,
491  uint32 CHECKPOINTS,
492  typename aligner_type,
493  typename pattern_string,
494  typename qual_string,
495  typename text_string,
496  typename backtracer_type>
499  const aligner_type aligner,
500  const pattern_string pattern,
501  const qual_string quals,
502  const text_string text,
503  const int32 min_score,
504  backtracer_type& backtracer)
505 {
506  typedef typename checkpoint_storage_type<aligner_type>::type checkpoint_type;
507  typedef typename column_storage_type<aligner_type>::type cell_type;
508 
509  cell_type column[MAX_TEXT_LEN];
511  const uint32 ELEMENTS_PER_WORD = 32 / BITS;
512 
513  checkpoint_type checkpoints[ (MAX_TEXT_LEN+1)*((MAX_PATTERN_LEN + CHECKPOINTS-1)/CHECKPOINTS) ];
514 
515  NVBIO_VAR_UNUSED const uint32 SUBMATRIX_WORDS = (MAX_TEXT_LEN*CHECKPOINTS + ELEMENTS_PER_WORD-1) / ELEMENTS_PER_WORD;
516  uint32 submatrix_base[ SUBMATRIX_WORDS ];
517  PackedStream<uint32*,uint8,BITS,false> submatrix( submatrix_base );
518 
519  return alignment_traceback<CHECKPOINTS>(
520  aligner,
521  pattern,
522  quals,
523  text,
524  min_score,
525  backtracer,
526  checkpoints,
527  submatrix,
528  column );
529 }
530 
531 #if defined(__CUDACC__)
532 namespace warp {
533 
534 // Compute the alignment score between a pattern and a text string with full DP alignment using a warp.
535 //
536 // This is a low-level function, requiring all needed temporary storage to be passed from the caller.
537 // The purpose is allowing the caller to allocate such storage (possibly among kernel threads) using different
538 // strategies.
539 //
540 // \tparam aligner_type an \ref Aligner "Aligner" algorithm
541 // \tparam pattern_string a string representing the pattern.
542 // \tparam qual_string an array representing the pattern qualities.
543 // \tparam text_string a string representing the text.
544 // \tparam column_type an array-like class defining operator[], used to represent a partial matrix column;
545 // the type of the matrix cells depends on the aligner, and can be obtained as
546 // typename column_storage_type<aligner_type>::type;
547 //
548 // \param aligner alignment algorithm
549 // \param pattern pattern string
550 // \param quals quality string
551 // \param text text string
552 // \param min_score threshold alignment score
553 // \param sink output sink
554 // \param column temporary storage for a matrix column, must be at least as large as the text
555 //
556 template <
558  typename aligner_type,
559  typename pattern_string,
560  typename qual_string,
561  typename text_string,
562  typename column_type>
565  const aligner_type aligner,
566  const pattern_string pattern,
567  const qual_string quals,
568  const text_string text,
569  const int32 min_score,
570  uint2* sink,
572 {
573  return priv::alignment_score<BLOCKDIM>(
574  aligner,
575  pattern,
576  quals,
577  text,
578  min_score,
579  sink,
580  column );
581 }
582 
583 } // namespace warp
584 #endif
585 
586 } // namespace alignment
587 } // namespace nvbio