NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
gotoh_warp_inl.h
Go to the documentation of this file.
1 /*
2  * nvbio
3  * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #pragma once
29 
30 #include <nvbio/basic/cuda/arch.h>
31 #include <nvbio/basic/cuda/scan.h>
32 #include <nvbio/alignment/sink.h>
33 #include <nvbio/alignment/utils.h>
36 
37 
38 namespace nvbio {
39 namespace aln {
40 namespace priv {
41 
42 template <
45  typename scoring_type,
46  typename string_type,
47  typename qual_type,
48  typename ref_type,
49  typename column_type>
52  const scoring_type& scoring,
53  const string_type str,
54  const qual_type quals,
55  const ref_type ref,
56  const int32 min_score,
57  uint2* sink,
58  column_type temp)
59 {
60 #if __CUDA_ARCH__ >= 350
61  typedef int32 score_type;
62  typedef alignment_result<score_type> alignment;
63 
64 
65  const uint32 WARP_SIZE = 1u << cuda::Arch::LOG_WARP_SIZE;
67 
68  const uint32 M = str.length();
69  const uint32 N = ref.length();
70 
71  const score_type SCORE_TEXT_GAP_OPEN = scoring.text_gap_open();
72  const score_type SCORE_TEXT_GAP_EXTEND = scoring.text_gap_extension();
73  const score_type SCORE_PATTERN_GAP_OPEN = scoring.pattern_gap_open();
74  const score_type SCORE_PATTERN_GAP_EXTEND = scoring.pattern_gap_extension();
75 
76  // local scores
77  score_type h_top, h_left, h_diag, hi;
78  score_type e_top, ei;
79  score_type f_left, fi;
80 
81  // local maximum score and corresponding sink
82  alignment best_alignment = alignment::minimum_value();
83 
84  // current reference string character
85  uint8 r_j;
86 
87  // per-thread cache for temp values and reference string characters
88  // each thread loads a different value; cache values are shuffled down the warp at each iteration
89  score_type temp_cache_h, temp_cache_f;
90  uint8 reference_cache;
91 
92  // width of the current warp-block stripe of the DP matrix (always WARP_SIZE except for the last stripe)
93  uint32 warp_block_width;
94 
95  // compute warp-block horizontal coordinate in DP matrix for this thread
96  const uint32 wi = warp_tid() + 1;
97 
98  // initialize the leftmost matrix column
99  for(uint32 i = warp_tid(); i < N; i += WARP_SIZE)
100  {
101  temp[i] = make_short2(
102  (TYPE == GLOBAL ? SCORE_TEXT_GAP_OPEN + SCORE_TEXT_GAP_EXTEND * i : 0),
103  (TYPE != LOCAL ? Field_traits<int16>::min() : 0) );
104  }
105 
106  for(uint32 warp_block = 0; warp_block < M; warp_block += WARP_SIZE)
107  {
108  // width of this block
109  warp_block_width = (warp_block + WARP_SIZE >= M ? M % WARP_SIZE : WARP_SIZE);
110  // compute the horizontal coordinate of the current thread in the DP matrix (including border column)
111  const uint32 i = wi + warp_block;
112 
113  // set top boundary values
114  h_top = (TYPE != LOCAL ? SCORE_PATTERN_GAP_OPEN + SCORE_PATTERN_GAP_EXTEND * (i - 1) : 0);
115  e_top = (TYPE != LOCAL ? Field_traits<int16>::min() : 0);
116 
117  // initialize diagonal
118  h_diag = (TYPE != LOCAL ? SCORE_PATTERN_GAP_OPEN + SCORE_PATTERN_GAP_EXTEND * (i - 2) : 0);
119 
120  // load the query string character for the current thread
121  const uint8 s_i = (i <= M ? str[i - 1] : 0);
122  const uint8 q_i = (i <= M ? quals[i - 1] : 0);
123 
124  // initialize the best score for this stripe
125  score_type max_score = Field_traits<score_type>::min();
126 
127  // loop over all DP anti-diagonals, excluding the border row/column
128  for(uint32 block_diag = 2; block_diag <= warp_block_width + N; block_diag += WARP_SIZE)
129  {
130  // reload caches every WARP_SIZE diagonals
131  const uint32 thread_j = (block_diag - 2) + warp_tid();
132 
133  if (thread_j < N)
134  {
135  temp_cache_h = temp[thread_j].x;
136  temp_cache_f = temp[thread_j].y;
137  reference_cache = ref[(block_diag - 2) + warp_tid()];
138  } else {
139  temp_cache_h = 0;
140  temp_cache_f = 0;
141  reference_cache = 0;
142  }
143 
144  for(uint32 diag = block_diag; diag < block_diag + WARP_SIZE; diag++)
145  {
146  // compute the length of this anti-diagonal (excluding border row/column)
147  const uint32 diag_len = nvbio::min3(diag - 1, WARP_SIZE, warp_block_width);
148  // compute vertical coordinate of the current cell in the DP matrix (including border column)
149  const uint32 j = diag - wi;
150 
151  // is the current cell inside the DP matrix?
152  if (wi <= diag_len && j <= N)
153  {
154  if (wi == 1)
155  {
156  // load new temp and reference values
157  r_j = reference_cache;
158  // initialize cell to the left of the current cell
159  h_left = temp_cache_h;
160  f_left = temp_cache_f;
161  }
162 
163  // compute the match/mismatch score
164  const score_type S_ij = (r_j == s_i) ? scoring.match(q_i) : scoring.mismatch(q_i);
165 
166  ei = nvbio::max(e_top + SCORE_TEXT_GAP_EXTEND,
167  h_top + SCORE_TEXT_GAP_OPEN);
168  fi = nvbio::max(f_left + SCORE_PATTERN_GAP_EXTEND,
169  h_left + SCORE_PATTERN_GAP_OPEN);
170  hi = nvbio::max3(h_diag + S_ij,
171  ei,
172  fi);
173 
174  if (TYPE == LOCAL)
175  {
176  // clamp score to zero
177  hi = nvbio::max(hi, score_type(0));
178  }
179 
180  // save off the last column
181  if (wi == WARP_SIZE)
182  {
183  temp[j - 1] = make_short2( hi, fi );
184 
185  // keep track of the best score in this stripe
186  max_score = nvbio::max( max_score, hi );
187  }
188 
189  // save the best score across the entire matrix for local scoring
190  // save the best score across the last column for semi-global scoring
191  if (TYPE == LOCAL ||
192  (TYPE == SEMI_GLOBAL && i == M))
193  {
194  if (hi > best_alignment.score)
195  best_alignment = alignment(hi, make_uint2(j, i));
196  }
197 
198  // current left becomes diagonal for next iteration on this lane
199  h_diag = h_left;
200 
201  // current value becomes h_top for next iteration on this lane
202  h_top = hi;
203  e_top = ei;
204  }
205 
206  // move previous cell reference value across the warp
207  r_j = __shfl_up(r_j, 1);
208  // hi becomes h_left on the next lane
209  h_left = __shfl_up(hi, 1);
210  f_left = __shfl_up(fi, 1);
211 
212  // push temp_cache and reference_cache values down the warp
213  temp_cache_h = __shfl_down(temp_cache_h, 1);
214  temp_cache_f = __shfl_down(temp_cache_f, 1);
215  reference_cache = __shfl_down(reference_cache, 1);
216  }
217  }
218 
219  if (warp_block + WARP_SIZE < M)
220  {
221  // we are now (M - warp_block - WARP_SIZE) columns away from the last one: check whether
222  // we could theoretically reach the minimum score
223  max_score = __shfl( max_score, WARP_SIZE - 1 );
224 
225  const score_type missing_cols = score_type(M - warp_block - WARP_SIZE);
226  if (max_score + missing_cols * scoring.match(255) < score_type( min_score ))
227  return Field_traits<int32>::min();
228  }
229  }
230 
231  if (TYPE == LOCAL || TYPE == SEMI_GLOBAL)
232  {
233  // do a warp-wide max-scan to find the largest score (TODO: use a reduction instead)
234  __shared__ volatile alignment sm_red [WARP_SIZE * NUM_WARPS * 2];
235  volatile alignment *sm_warp_red = sm_red + WARP_SIZE * warp_id() * 2;
236  cuda::scan<32>(best_alignment, alignment::max_operator(), alignment::minimum_value(), sm_warp_red);
237  best_alignment = cuda::scan_total<32>(sm_warp_red);
238  }
239 
240  if (TYPE == GLOBAL)
241  {
242  best_alignment.score = __shfl(hi, warp_block_width - 1);
243  best_alignment.sink = make_uint2(N,M);
244  }
245 
246  *sink = best_alignment.sink;
247  return best_alignment.score;
248 #else
249  // unsupported on compute capability < 3.5
250  return 0;
251 #endif
252 }
253 
254 // private dispatcher for the warp-parallel version of the Gotoh aligner
255 template <
258  typename scoring_type,
259  typename pattern_string,
260  typename qual_string,
261  typename text_string,
262  typename column_type>
265  const GotohAligner<TYPE,scoring_type> aligner,
266  const pattern_string pattern,
267  const qual_string quals,
268  const text_string text,
269  const int32 min_score,
270  uint2* sink,
272 {
273 #if defined(NVBIO_DEVICE_COMPILATION)
274  return gotoh_alignment_score<BLOCKDIM,TYPE>(
275  aligner.scheme,
276  pattern,
277  quals,
278  text,
279  min_score,
280  sink,
281  column );
282 #else
283  return Field_traits<int32>::min();
284 #endif
285 }
286 
287 } // namespace priv
288 } // namespace aln
289 } // namespace nvbio