NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
aligner_best_exact.h
Go to the documentation of this file.
1 /*
2  * nvbio
3  * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 namespace nvbio {
29 namespace bowtie2 {
30 namespace cuda {
31 
// ---------------------------------------------------------------------------
// NOTE(review): this is a doxygen source-listing extraction — each code line
// carries its original listing number as a leading token, and listing lines
// 48, 68, 70, 156, 216 and 228 were dropped by the extraction, so several
// statements below are visibly truncated. Only comments are added here; the
// extracted code text is left byte-for-byte untouched. Reconcile against the
// real nvbio source before editing.
// ---------------------------------------------------------------------------
//
// Aligner::best_exact
//
// Top-level driver for best-exact alignment of a batch of reads:
//   1. maps exact seed hits against the forward/reverse FM-indices
//      (bowtie2_map_exact), storing per-read SA-range hits and counts;
//   2. runs up to `max_seeds` passes (one seed per pass): extracts the top
//      hit range, scores it via best_exact_score(), then prunes finished
//      reads (bowtie2_prune_search), compacting the active set through a
//      pair of input/output queues swapped each iteration;
//   3. backtracks the best and (when present) second-best alignments
//      (bowtie2_backtrack) to produce CIGARs and MD strings, and copies
//      the results back to host vectors indexed by `output_set`.
//
// Parameters (as visible here):
//   params       - search parameters (max_dist, seed_len, keep_stats, ...)
//   fmi / rfmi   - forward / reverse FM-index views (passed by value)
//   scoring_scheme - scoring scheme forwarded to scoring/backtracking
//   driver_data  - device-side genome data (length + stream)
//   read_data    - device-side read batch (streams, index, quals)
//   output_set   - which host output buffer set (1 of N) receives results
//   stats        - timing/statistics accumulator, updated per phase
//
// Uses many Aligner member buffers not visible in this listing
// (seed_queue_dvec, hits_dptr, cnts_dptr, cigar_dvec, best_data_dvec, ...).
32 template <
33  typename FMI,
34  typename rFMI,
35  typename scoring_scheme_type>
36 void Aligner::best_exact(
37  const Params& params,
38  const FMI fmi,
39  const rFMI rfmi,
40  const scoring_scheme_type& scoring_scheme,
41  const io::FMIndexDataCUDA& driver_data,
42  io::ReadDataCUDA& read_data,
43  const uint32 output_set,
44  Stats& stats)
45 {
46  typedef uint32 read_storage_type;
47  typedef uint4 read_storage_type4;
// NOTE(review): listing line 48 is missing — it presumably declared the
// plain `read_base_type` typedef (READ_TEX_SELECTOR over uint32) that the
// `read_base_type_A` / `read_base_type_B` typedefs below refer to. Confirm.
49  typedef READ_TEX_SELECTOR( const uint4*, nvbio::cuda::ldg_pointer<uint4> ) read_base_type4;
50 
51  typedef read_storage_type read_storage_type_A;
52  typedef read_base_type read_base_type_A;
53 
54  #if USE_UINT4_PACKING
55  typedef read_storage_type4 read_storage_type_B;
56  typedef read_base_type4 read_base_type_B;
57  #else
58  typedef read_storage_type read_storage_type_B;
59  typedef read_base_type read_base_type_B;
60  #endif
61 
62  Timer timer;
// NOTE(review): `global_timer` is declared but never used in this listing.
63  Timer global_timer;
64 
65  const uint32 count = read_data.size();
// band_len: DP band width derived from the maximum allowed edit distance.
66  const uint32 band_len = band_length( params.max_dist );
67 
// NOTE(review): listing lines 68 and 70 are missing here — line 70 was
// presumably the opening `thrust::copy(` of the statement that fills
// seed_queue_dvec with the identity sequence [0, count). Confirm.
69  //ScopedLock lock( &output_thread.m_lock[ output_set ] );
71  thrust::make_counting_iterator(0u),
72  thrust::make_counting_iterator(0u) + count,
73  seed_queue_dvec.begin() );
74 
75  seed_queue_size_dvec[0] = count;
76 
77  //
78  // Similarly to Bowtie2, we perform a number of seed & extension passes, except
79  // that here we extract just one seed per pass.
80  // Since reads may have different length, and the search might for some might terminate
81  // earlier than for others, the number of reads actively processed in each pass can vary
82  // substantially.
83  // In order to keep the cores and all its lanes busy, we use a pair of input & output
84  // queues to compact the set of active reads in each round, swapping them at each
85  // iteration.
86  //
87 
88  // initialize seeding queues
89  HostQueues<uint32> seed_queues;
90  seed_queues.in_size = count;
91  seed_queues.in_queue = seed_queue_dptr;
92  seed_queues.out_size = NULL; // for mapping, we initially setup a NULL output queue
93  seed_queues.out_queue = NULL; // for mapping, we initially setup a NULL output queue
94 
95  // initialize the seed hit counts
96  thrust::fill( cnts_dvec.begin(), cnts_dvec.end(), uint32(0) );
97 
// reads_A: device-side view of the read batch in the "A" (uint32) packing.
98  Batch<read_base_type_A,const char*> reads_A(
99  0, count,
100  read_base_type_A( (const read_storage_type_A*)read_data.read_stream() ),
101  read_data.read_index(),
102  read_data.qual_stream());
103 
104  //
105  // perform mapping
106  //
107  {
108  timer.start();
109 
110  #if 0
111  bowtie2_map_exact(
112  reads_A, fmi, rfmi,
113  0u, seed_queues.device(),
114  hits_dptr, cnts_dptr,
115  params );
116  #else
117  bowtie2_map_exact(
118  reads_A, fmi, rfmi,
119  hits_dptr, cnts_dptr,
120  params );
121  #endif
// NOTE(review): cudaThreadSynchronize() is deprecated — the modern
// equivalent is cudaDeviceSynchronize() (applies to every occurrence in
// this file). Kept as-is here since this is a comments-only pass.
122  cudaThreadSynchronize();
123  nvbio::cuda::check_error("mapping kernel");
124 
125  timer.stop();
126  stats.map.add( seed_queues.in_size, timer.seconds() );
127  }
128 
129  // take some stats on the hits we got
130  if (params.keep_stats)
131  keep_stats( reads_A.size(), stats );
132 
133  // setup output queue
134  seed_queues.out_size = seed_queue_size_dptr;
135  seed_queues.out_queue = seed_queue_dptr + BATCH_SIZE;
136 
// max_seeds: one pass per non-overlapping seed that fits in the longest read.
137  const uint32 max_seeds = read_data.m_max_read_len / params.seed_len;
138 
139  thrust::device_vector<SeedHit> hits_level_dvec( BATCH_SIZE );
140  thrust::device_vector<uint32> cnts_level_dvec( BATCH_SIZE );
141 
// one seed & extension pass per iteration, until no reads remain active
142  for (uint32 seed_idx = 0; seed_queues.in_size && seed_idx < max_seeds; ++seed_idx)
143  {
// debug/statistics scope: count the global number of outstanding hits
144  {
// NOTE(review): `blocks` is computed but unused in this listing.
145  const uint32 blocks = (read_data.size() + BLOCKDIM-1) / BLOCKDIM;
146 
147  thrust::fill( hits_stats_dvec.begin(), hits_stats_dvec.end(), 0u );
148  bowtie2_hits_stats( read_data.size(), hits_dptr, cnts_dptr, hits_stats_dptr );
149 
150  cudaThreadSynchronize();
151  nvbio::cuda::check_error("hit stats kernel");
152 
// NOTE(review): the synchronize + check_error pair below duplicates the one
// just above (listing lines 150-154); one copy is redundant.
153  cudaThreadSynchronize();
154  nvbio::cuda::check_error("hit stats kernel");
155 
// NOTE(review): listing line 156 is missing — presumably the device-to-host
// copy of hits_stats_dvec into hits_stats_hvec read just below. Confirm.
157 
158  const uint64 n_hits = hits_stats_hvec[ HIT_STATS_TOTAL ];
// NOTE(review): unconditional debug output to stderr on every pass —
// consider gating this behind a verbosity flag in the real source.
159  fprintf(stderr, "\nseed idx: %u (%u active reads - %.2f M global hits)\n", seed_idx, seed_queues.in_size, float(n_hits) * 1.0e-6f);
160  }
161 
// pull the top SA-range of each read's hit list into the per-level buffers
162  extract_top_range(
163  reads_A, hits_dptr, cnts_dptr,
164  thrust::raw_pointer_cast( &hits_level_dvec.front() ),
165  thrust::raw_pointer_cast( &cnts_level_dvec.front() ) );
166 
167  cudaThreadSynchronize();
168  nvbio::cuda::check_error("extract-top-range kernel");
169 
// select/locate/score this seed's hits for all active reads
170  best_exact_score(
171  params,
172  fmi,
173  rfmi,
174  scoring_scheme,
175  driver_data,
176  read_data,
177  seed_idx,
178  seed_queues,
179  thrust::raw_pointer_cast( &hits_level_dvec.front() ),
180  thrust::raw_pointer_cast( &cnts_level_dvec.front() ),
181  stats );
182 
183  // initialize output seeding queue size
184  seed_queues.clear_output();
185 
// drop reads whose best alignment can no longer be improved
186  bowtie2_prune_search(
187  seed_idx,
188  seed_queues.device(),
189  best_data_dptr );
190 
191  cudaThreadSynchronize();
192  nvbio::cuda::check_error("pruning kernel");
193 
194  // swap input & output queues
195  seed_queues.swap();
196  }
197 
198  //
199  // At this point, for each read we have the scores and rough alignment positions of the
200  // best two alignments: to compute the final results we need to backtrack the DP extension,
201  // and compute accessory CIGARS and MD strings.
202  //
203 
// reads_B: same batch in the "B" packing selected by USE_UINT4_PACKING.
204  Batch<read_base_type_B,const char*> reads_B(
205  0, count,
206  read_base_type_B( (const read_storage_type_B*)read_data.read_stream() ),
207  read_data.read_index(),
208  read_data.qual_stream());
209 
210  //
211  // perform backtracking and compute cigars for the best alignments
212  //
213  timer.start();
214  {
215  BacktrackBestContext<0,edit_distance_scoring> context(
// NOTE(review): listing line 216 is missing — the first constructor
// argument of BacktrackBestContext (presumably best_data_dptr). Confirm.
217  NULL );
218 
219  bowtie2_backtrack(
220  read_data.max_read_len(),
221  band_len,
222  count,
223  reads_B,
224  scoring_scheme,
225  driver_data.genome_length(),
226  driver_data.genome_stream(),
227  cigar_dptr,
// NOTE(review): listing line 228 is missing between cigar_dptr and mds_dptr
// — likely a cigar-coordinates argument. Confirm against the real source.
229  mds_dptr,
230  count,
231  context,
232  params );
233 
234  cudaThreadSynchronize();
235  nvbio::cuda::check_error("backtracking kernel");
236  }
237  timer.stop();
238  stats.backtrack.add( count, timer.seconds() );
239 
240  // copy best cigars back to the host
241  cigar_coords_hvec1[output_set] = cigar_coords_dvec;
242  nvbio::cuda::copy( cigar_dvec, cigar_hvec1[output_set] );
243  nvbio::cuda::copy( mds_dvec, mds_hvec1[output_set] );
244 
245  // overlap the second-best indices with the loc queue
246  thrust::device_vector<uint32>& second_idx_dvec = loc_queue_dvec;
247 
248  // compact the indices of the second-best alignments
249  const uint32 n_second = uint32( thrust::copy_if(
250  thrust::make_counting_iterator(0u),
251  thrust::make_counting_iterator(0u) + count,
252  best_data_dvec.begin(),
253  second_idx_dvec.begin(),
254  has_second() ) - second_idx_dvec.begin() );
255 
256  //
257  // perform backtracking and compute cigars for the second-best alignments
258  //
259  if (n_second)
260  {
261  timer.start();
262 
263  uint32* second_idx = thrust::raw_pointer_cast( &second_idx_dvec[0] );
264 
265  BacktrackBestContext<1,edit_distance_scoring> context(
// NOTE(review): listing line 266 is missing — the first constructor
// argument (presumably best_data_dptr), mirroring the best-alignment
// context above. Confirm.
267  second_idx );
268 
269  bowtie2_backtrack(
270  read_data.max_read_len(),
271  band_len,
272  n_second,
273  reads_B,
274  scoring_scheme,
275  driver_data.genome_length(),
276  driver_data.genome_stream(),
277  cigar_dptr,
// NOTE(review): listing line 278 is missing — same dropped argument as in
// the best-alignment bowtie2_backtrack call above. Confirm.
279  mds_dptr,
280  count,
281  context,
282  params );
283 
284  cudaThreadSynchronize();
285  nvbio::cuda::check_error("second-best backtracking kernel");
286 
287  timer.stop();
288  stats.backtrack.add( n_second, timer.seconds() );
289  }
290 
291  // copy second-best cigars back to the host
292  cigar_coords_hvec2[output_set] = cigar_coords_dvec;
293  nvbio::cuda::copy( cigar_dvec, cigar_hvec2[output_set] );
294  nvbio::cuda::copy( mds_dvec, mds_hvec2[output_set] );
295 
296  // copy results back to the host
297  thrust::copy( best_data_dvec.begin(), best_data_dvec.begin() + count, best_data_hvec[output_set].begin() );
298 }
299 
// ---------------------------------------------------------------------------
// NOTE(review): doxygen source-listing extraction — listing lines 332, 371,
// 461, 495, 504, 517, 611, 654 and 669 were dropped, truncating several call
// argument lists below. Only comments are added in this pass; the extracted
// code text is left byte-for-byte untouched.
// ---------------------------------------------------------------------------
//
// Aligner::best_exact_score
//
// Scores one seed (seed_idx) for every read in the input seed queue.
// Copies the seed queue into a hit queue, then loops until the hit queue
// drains, choosing one of two regimes each round based on queue size:
//   - large queue (> 128K active reads): select ONE hit per read per round
//     (bowtie2_select), locate it (bowtie2_locate), score it (bowtie2_score)
//     and reduce (bowtie2_score_reduce) — maximizes early-out opportunities;
//   - small queue: select up to `n_multi` (<= 4096) hits per read per round
//     (bowtie2_select_multi / bowtie2_score_multi_reduce) to keep kernel
//     launches large at the cost of extra bookkeeping.
// Each round sorts selected hits by SA coordinate before locating, and by
// linear genome coordinate before scoring, for memory-access coherence.
//
// Parameters (as visible here):
//   params, fmi, rfmi, scoring_scheme, driver_data, read_data - as in
//     best_exact(); scoring_scheme is unused in this listing's visible code
//   seed_idx      - index of the seed being processed in this pass
//   seed_queues   - active-read queues shared with the caller's pass loop
//   hits_level_dptr / cnts_level_dptr - per-level top-range hit buffers
//   stats         - timing/statistics accumulator
300 template <
301  typename FMI,
302  typename rFMI,
303  typename scoring_scheme_type>
304 void Aligner::best_exact_score(
305  const Params& params,
306  const FMI fmi,
307  const rFMI rfmi,
308  const scoring_scheme_type& scoring_scheme,
309  const io::FMIndexDataCUDA& driver_data,
310  io::ReadDataCUDA& read_data,
311  const uint32 seed_idx,
312  HostQueues<uint32>& seed_queues,
313  SeedHit* hits_level_dptr,
314  uint32* cnts_level_dptr,
315  Stats& stats)
316 {
// NOTE(review): this packing selection looks inverted relative to
// best_exact(), where USE_UINT4_PACKING selects the uint4 storage type —
// here it selects uint32. Confirm which orientation is intended.
317  #if USE_UINT4_PACKING
318  typedef uint32 read_storage_type;
319  #else
320  typedef uint4 read_storage_type;
321  #endif
322  typedef READ_TEX_SELECTOR( const read_storage_type*, nvbio::cuda::ldg_pointer<read_storage_type> ) read_base_type;
323 
// total outstanding hits — used only for the progress report below
324  uint32 hits_to_process;
325  {
326  thrust::fill( hits_stats_dvec.begin(), hits_stats_dvec.end(), 0u );
327  bowtie2_hits_stats( read_data.size(), hits_dptr, cnts_dptr, hits_stats_dptr );
328 
// NOTE(review): cudaThreadSynchronize() is deprecated in favor of
// cudaDeviceSynchronize() (applies to every occurrence in this function).
329  cudaThreadSynchronize();
330  nvbio::cuda::check_error("hit stats kernel");
331 
// NOTE(review): listing line 332 is missing — presumably the device-to-host
// copy of hits_stats_dvec into hits_stats_hvec read just below. Confirm.
333 
334  hits_to_process = hits_stats_hvec[ HIT_STATS_TOTAL ];
335  }
336 
337  Timer timer;
// NOTE(review): `global_timer` is declared but never used in this listing.
338  Timer global_timer;
339 
340  const uint32 count = read_data.size();
// NOTE(review): band_len is derived from seed_idx here, while best_exact()
// derives it from params.max_dist — this looks suspicious (band width as a
// function of the seed index rather than the allowed distance). Confirm
// against the real source before relying on it.
341  const uint32 band_len = band_length( seed_idx );
342 
343  Batch<read_base_type,const char*> reads(
344  0, count,
345  read_base_type( (const read_storage_type*)read_data.read_stream() ),
346  read_data.read_index(),
347  read_data.qual_stream());
348 
349  //
350  // At this point we have a queue full of reads, each with an associated set of
351  // seed hits encoded as a (sorted) list of SA ranges.
352  // For each read we need to:
353  // 1. select some seed hit to process (i.e. a row in one of the SA ranges)
354  // 2. locate it, i.e. converting from SA to linear coordinates
355  // 3. and score it
356  // until some search criteria are satisfied.
357  // The output queue is then reused in the next round as the input queue, and
358  // viceversa.
359  //
360  HostQueues<uint32> hit_queues;
361  hit_queues.in_size = seed_queues.in_size;
362  hit_queues.in_queue = hits_queue_dptr;
363  hit_queues.out_size = hits_queue_size_dptr;
364  hit_queues.out_queue = hits_queue_dptr + BATCH_SIZE;
365 
// NOTE(review): return value of cudaMemcpy is ignored — consider checking
// it like the kernel launches below.
366  cudaMemcpy( hit_queues.in_queue, seed_queues.in_queue, sizeof(uint32) * hit_queues.in_size, cudaMemcpyDeviceToDevice );
367 
368  bool first_run = true;
369 
370  SelectBestExactContext select_context(
// NOTE(review): listing line 371 is missing — the first constructor
// argument of SelectBestExactContext (presumably best_data_dptr). Confirm.
372  params.max_dist,
373  seed_idx,
374  seed_idx == 0 ? true : false ); // init
375 
376  // use the score queue to store temporary packed seed data
377  uint32* seed_data_dptr = (uint32*)score_queue_dptr;
378 
379  uint32 processed_hits = 0;
380 
381  while (hit_queues.in_size)
382  {
383  // sort the input queue
384  thrust::sort(
385  hits_queue_dvec.begin() + (hit_queues.in_queue - hits_queue_dptr),
386  hits_queue_dvec.begin() + (hit_queues.in_queue - hits_queue_dptr) + hit_queues.in_size );
387 
// NOTE(review): unconditional per-round progress output to stderr —
// consider gating behind a verbosity flag in the real source.
388  fprintf(stderr,"\rcount: %u (%.2f / %.2f M hits processed) ", hit_queues.in_size, float(processed_hits) * 1.0e-6f, float(hits_to_process) * 1.0e-6f);
389  hit_queues.clear_output();
390 
391  /*if (hit_queues.in_size < read_data.size() / 500)
392  {
393  // we can safely ignore this small set of active reads, as we'll drop only
394  // a given amount of sensitivity
395  return;
396  }
397  else*/ if (hit_queues.in_size > 128u*1024u)
398  {
399  //
400  // The queue of actively processed reads is sufficiently large to allow
401  // selecting & scoring one seed hit at a time and still have large kernel
402  // launches. This is algorithmically the most efficient choice (because
403  // it enables frequent early outs), so let's choose it.
404  //
405  timer.start();
406 
407  //
408  // select and locate the next round of hits for the reads in the input queue
409  //
410  bowtie2_select(
411  first_run,
412  reads, fmi, rfmi, hits_level_dptr, cnts_level_dptr,
413  select_context,
414  hit_queues.in_size,
415  hit_queues.in_queue,
416  hit_queues.out_size,
417  hit_queues.out_queue,
418  loc_queue_dptr,
419  seed_data_dptr,
420  params );
421 
422  first_run = false;
423 
424  cudaThreadSynchronize();
425  nvbio::cuda::check_error("selecting kernel");
426 
427  timer.stop();
428  stats.select.add( hit_queues.in_size, timer.seconds() );
429 
430  // fetch the new queue size
431  hit_queues.swap();
432  if (hit_queues.in_size == 0)
433  break;
434 
435  processed_hits += hit_queues.in_size;
436 
437  timer.start();
438 
439  // sort the selected hits by their SA coordinate
440  thrust::copy(
441  thrust::make_counting_iterator(0u),
442  thrust::make_counting_iterator(0u) + hit_queues.in_size,
443  idx_queue_dvec.begin() );
444 
445  thrust::sort_by_key(
446  loc_queue_dvec.begin(),
447  loc_queue_dvec.begin() + hit_queues.in_size,
448  idx_queue_dvec.begin() );
449 
450  timer.stop();
451  stats.sort.add( hit_queues.in_size, timer.seconds() );
452 
453  // NOTE: only 75-80% of these locations are unique.
454  // It might pay off to do a compaction beforehand.
455 
456  // and locate their position in linear coordinates
457  bowtie2_locate(
458  reads, fmi, rfmi,
459  hit_queues.in_size,
460  loc_queue_dptr,
// NOTE(review): listing line 461 is missing — likely an idx-queue pointer
// argument of bowtie2_locate (compare the select-multi branch). Confirm.
462  seed_data_dptr,
463  params );
464 
465  cudaThreadSynchronize();
466  nvbio::cuda::check_error("locating kernel");
467 
// NOTE(review): there is no timer.start() between the stats.sort.add above
// and this timer.stop() — the interval recorded into stats.locate appears
// stale. Confirm Timer semantics / intended measurement.
468  timer.stop();
469  stats.locate.add( hit_queues.in_size, timer.seconds() );
470 
471  timer.start();
472 
473  // sort the selected hits by their linear genome coordinate
474  // TODO: sub-sort by read position/RC flag so as to (1) get better coherence,
475  // (2) allow removing duplicate extensions
476  thrust::sort_by_key(
477  loc_queue_dvec.begin(),
478  loc_queue_dvec.begin() + hit_queues.in_size,
479  idx_queue_dvec.begin() );
480 
481  timer.stop();
482  stats.sort.add( hit_queues.in_size, timer.seconds() );
483 
484  //
485  // assign a score to all selected hits (currently in the output queue)
486  //
487  if (hit_queues.in_size)
488  {
489  timer.start();
490 
491  ScoreBestContext<edit_distance_scoring> context(
492  NULL,
493  hit_queues.in_queue,
494  score_queue_dptr,
// NOTE(review): listing line 495 is missing — the fourth constructor
// argument (presumably best_data_dptr, as in the multi branch). Confirm.
496  seed_idx );
497 
498  // compute the score for each selected hit (one thread per hit).
499  bowtie2_score(
500  band_len,
501  reads,
502  context,
503  hit_queues.in_size,
// NOTE(review): listing line 504 is missing — likely the idx-queue pointer
// argument of bowtie2_score. Confirm.
505  loc_queue_dptr,
506  driver_data.genome_length(),
507  driver_data.genome_stream(),
508  params );
509 
510  cudaThreadSynchronize();
511  nvbio::cuda::check_error("score kernel");
512 
513  const ReduceBestExactContext reduce_context;
514 
515  bowtie2_score_reduce<edit_distance_scoring>(
516  reads, reduce_context, hits_level_dptr, cnts_level_dptr,
// NOTE(review): listing line 517 is missing — likely the best_data_dptr
// argument, mirroring the multi-reduce call below. Confirm.
518  hit_queues.in_size,
519  hit_queues.in_queue,
520  loc_queue_dptr,
521  score_queue_dptr,
522  params );
523 
524  cudaThreadSynchronize();
525  nvbio::cuda::check_error("score-reduce kernel");
526 
527  timer.stop();
528  stats.score.add( hit_queues.in_size, timer.seconds() );
529  }
530  }
531  else
532  {
533  //
534  // The queue of actively processed reads is very small: at this point
535  // it's better to select multiple hits to process in each round.
536  // This adds some book keeping overheads, but allows to make larger
537  // kernel launches.
538  //
539  timer.start();
540 
541  score_output_count_dvec[0] = 0;
542 
543  // the maximum number of extensions we can perform in one iteration
544  // is at most 4096, as we use 12 bits to encode the extension index
545  const uint32 max_ext = 4096;
546 
547  // try to generate 256K items to process
548  const uint32 n_multi =
549  std::min(
550  (256u*1024u) / hit_queues.in_size,
551  max_ext );
552 
553  const uint32 score_output_queue_stride = hit_queues.in_size;
554 
555  bowtie2_select_multi(
556  first_run,
557  reads, fmi, rfmi, hits_level_dptr, cnts_level_dptr,
558  select_context,
559  hit_queues.in_size,
560  hit_queues.in_queue,
561  hit_queues.out_size,
562  hit_queues.out_queue,
563  score_output_count_dptr,
564  loc_queue_dptr,
565  seed_data_dptr,
566  score_parent_queue_dptr,
567  score_output_queue_dptr,
568  score_output_queue_stride,
569  n_multi,
570  params );
571 
572  first_run = false;
573 
574  cudaThreadSynchronize();
575  nvbio::cuda::check_error("select-multi kernel");
576 
577  timer.stop();
578  stats.select.add( hit_queues.in_size * n_multi, timer.seconds() );
579 
580  // swap the input & output queues
581  hit_queues.swap();
582 
583  // fetch the output queue size
584  const uint32 score_queue_size = score_output_count_dvec[0];
585  if (score_queue_size == 0)
586  break;
587 
588  processed_hits += score_queue_size;
589 
590  timer.start();
591 
592  // sort the selected hits by their SA coordinate
593  thrust::copy(
594  thrust::make_counting_iterator(0u),
595  thrust::make_counting_iterator(0u) + score_queue_size,
596  idx_queue_dvec.begin() );
597 
598  thrust::sort_by_key(
599  loc_queue_dvec.begin(),
600  loc_queue_dvec.begin() + score_queue_size,
601  idx_queue_dvec.begin() );
602 
603  timer.stop();
604  stats.sort.add( score_queue_size, timer.seconds() );
605 
606  // and locate their position in linear coordinates
607  bowtie2_locate(
608  reads, fmi, rfmi,
609  score_queue_size,
610  loc_queue_dptr,
// NOTE(review): listing line 611 is missing — likely the idx-queue pointer
// argument of bowtie2_locate. Confirm.
612  seed_data_dptr,
613  params );
614 
615  cudaThreadSynchronize();
616  nvbio::cuda::check_error("locating kernel");
617 
// NOTE(review): as in the single-hit branch, no timer.start() precedes this
// timer.stop() — the stats.locate interval appears stale. Confirm.
618  timer.stop();
619  stats.locate.add( score_queue_size, timer.seconds() );
620 
621  timer.start();
622 
623  // sort the selected hits by their linear genome coordinate
624  // TODO: sub-sort by read_id/RC flag so as to (1) get better coherence,
625  // (2) allow removing duplicate extensions
626  thrust::sort_by_key(
627  loc_queue_dvec.begin(),
628  loc_queue_dvec.begin() + score_queue_size,
629  idx_queue_dvec.begin() );
630 
631  timer.stop();
// NOTE(review): this branch consistently accounts work in score_queue_size
// units, but this line uses hit_queues.in_size — looks like a copy-paste
// slip from the single-hit branch. Confirm.
632  stats.sort.add( hit_queues.in_size, timer.seconds() );
633 
634  //
635  // assign a score to all selected hits (currently in the output queue)
636  //
637  if (score_queue_size)
638  {
639  timer.start();
640 
641  ScoreBestContext<edit_distance_scoring> context(
642  score_parent_queue_dptr,
643  hit_queues.in_queue,
644  score_queue_dptr,
645  best_data_dptr,
646  seed_idx );
647 
648  // compute the score for each selected hit (one thread per hit).
649  bowtie2_score(
650  band_len,
651  reads,
652  context,
653  score_queue_size,
// NOTE(review): listing line 654 is missing — likely the idx-queue pointer
// argument of bowtie2_score. Confirm.
655  loc_queue_dptr,
656  driver_data.genome_length(),
657  driver_data.genome_stream(),
658  params );
659 
660  cudaThreadSynchronize();
661  nvbio::cuda::check_error("score-multi kernel");
662 
663  const ReduceBestExactContext reduce_context;
664 
665  // reduce the multiple scores to find the best two alignments
666  // (one thread per active read).
667  bowtie2_score_multi_reduce<edit_distance_scoring>(
668  reads, reduce_context, hits_level_dptr, cnts_level_dptr,
// NOTE(review): listing line 669 is missing — likely the best_data_dptr
// argument of the multi-reduce. Confirm.
670  hit_queues.in_size,
671  hit_queues.in_queue,
672  score_output_queue_stride,
673  score_output_queue_dptr,
674  score_parent_queue_dptr,
675  score_queue_dptr,
676  loc_queue_dptr,
677  params );
678 
679  cudaThreadSynchronize();
680  nvbio::cuda::check_error("score-multi-reduce kernel");
681 
682  timer.stop();
683  stats.score.add( score_queue_size, timer.seconds() );
684  }
685  }
686  }
687 }
688 
689 } // namespace cuda
690 } // namespace bowtie2
691 } // namespace nvbio