45 typename scoring_type,
52 const scoring_type& scoring,
53 const string_type str,
54 const qual_type quals,
56 const int32 min_score,
60 #if __CUDA_ARCH__ >= 350
61 typedef int32 score_type;
68 const uint32 M = str.length();
69 const uint32 N = ref.length();
71 const score_type SCORE_TEXT_GAP_OPEN = scoring.text_gap_open();
72 const score_type SCORE_TEXT_GAP_EXTEND = scoring.text_gap_extension();
73 const score_type SCORE_PATTERN_GAP_OPEN = scoring.pattern_gap_open();
74 const score_type SCORE_PATTERN_GAP_EXTEND = scoring.pattern_gap_extension();
77 score_type h_top, h_left, h_diag, hi;
79 score_type f_left, fi;
82 alignment best_alignment = alignment::minimum_value();
89 score_type temp_cache_h, temp_cache_f;
90 uint8 reference_cache;
96 const uint32 wi = warp_tid() + 1;
99 for(
uint32 i = warp_tid(); i < N; i += WARP_SIZE)
101 temp[i] = make_short2(
102 (
TYPE ==
GLOBAL ? SCORE_TEXT_GAP_OPEN + SCORE_TEXT_GAP_EXTEND * i : 0),
106 for(
uint32 warp_block = 0; warp_block < M; warp_block += WARP_SIZE)
109 warp_block_width = (warp_block + WARP_SIZE >= M ? M % WARP_SIZE : WARP_SIZE);
111 const uint32 i = wi + warp_block;
114 h_top = (
TYPE !=
LOCAL ? SCORE_PATTERN_GAP_OPEN + SCORE_PATTERN_GAP_EXTEND * (i - 1) : 0);
118 h_diag = (
TYPE !=
LOCAL ? SCORE_PATTERN_GAP_OPEN + SCORE_PATTERN_GAP_EXTEND * (i - 2) : 0);
121 const uint8 s_i = (i <= M ? str[i - 1] : 0);
122 const uint8 q_i = (i <= M ? quals[i - 1] : 0);
128 for(
uint32 block_diag = 2; block_diag <= warp_block_width + N; block_diag += WARP_SIZE)
131 const uint32 thread_j = (block_diag - 2) + warp_tid();
135 temp_cache_h = temp[thread_j].x;
136 temp_cache_f = temp[thread_j].y;
137 reference_cache = ref[(block_diag - 2) + warp_tid()];
144 for(
uint32 diag = block_diag; diag < block_diag + WARP_SIZE; diag++)
149 const uint32 j = diag - wi;
152 if (wi <= diag_len && j <= N)
157 r_j = reference_cache;
159 h_left = temp_cache_h;
160 f_left = temp_cache_f;
164 const score_type S_ij = (r_j == s_i) ? scoring.match(q_i) : scoring.mismatch(q_i);
166 ei =
nvbio::max(e_top + SCORE_TEXT_GAP_EXTEND,
167 h_top + SCORE_TEXT_GAP_OPEN);
168 fi =
nvbio::max(f_left + SCORE_PATTERN_GAP_EXTEND,
169 h_left + SCORE_PATTERN_GAP_OPEN);
183 temp[j - 1] = make_short2( hi, fi );
194 if (hi > best_alignment.score)
195 best_alignment = alignment(hi, make_uint2(j, i));
207 r_j = __shfl_up(r_j, 1);
209 h_left = __shfl_up(hi, 1);
210 f_left = __shfl_up(fi, 1);
213 temp_cache_h = __shfl_down(temp_cache_h, 1);
214 temp_cache_f = __shfl_down(temp_cache_f, 1);
215 reference_cache = __shfl_down(reference_cache, 1);
219 if (warp_block + WARP_SIZE < M)
223 max_score = __shfl( max_score, WARP_SIZE - 1 );
225 const score_type missing_cols = score_type(M - warp_block - WARP_SIZE);
226 if (max_score + missing_cols * scoring.match(255) < score_type( min_score ))
234 __shared__
volatile alignment sm_red [WARP_SIZE * NUM_WARPS * 2];
235 volatile alignment *sm_warp_red = sm_red + WARP_SIZE * warp_id() * 2;
236 cuda::scan<32>(best_alignment, alignment::max_operator(), alignment::minimum_value(), sm_warp_red);
237 best_alignment = cuda::scan_total<32>(sm_warp_red);
242 best_alignment.score = __shfl(hi, warp_block_width - 1);
243 best_alignment.sink = make_uint2(N,M);
246 *sink = best_alignment.sink;
247 return best_alignment.score;
258 typename scoring_type,
259 typename pattern_string,
260 typename qual_string,
261 typename text_string,
266 const pattern_string pattern,
267 const qual_string quals,
268 const text_string
text,
269 const int32 min_score,
273 #if defined(NVBIO_DEVICE_COMPILATION)
274 return gotoh_alignment_score<BLOCKDIM,TYPE>(