aligner_best_approx.h
/*
 * nvbio
 * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *    * Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *    * Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *    * Neither the name of the NVIDIA CORPORATION nor the
 *      names of its contributors may be used to endorse or promote products
 *      derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <nvbio/basic/cuda/ldg.h>
#include <nvbio/basic/primitives.h>

namespace nvbio {
namespace bowtie2 {
namespace cuda {

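// MapqFunctorSE computes the mapping quality of the best alignment found for
// a given read in a single-end batch: it gathers the best and second-best
// alignments from the strided best_data array, looks up the read's length,
// and feeds them to a MapqCalculator (e.g. BowtieMapq2). It is meant to be
// used as a per-read transform functor (see the nvbio::transform call in
// best_approx below).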
template <typename MapqCalculator, typename ReadData>
struct MapqFunctorSE
{
    typedef uint8 result_type;

    MapqFunctorSE(
        MapqCalculator       _mapq_eval,
        const io::Alignment* _best_data,
        const uint32         _best_stride,
        const ReadData       _read_data) :
        mapq_eval( _mapq_eval ),
        best_data( _best_data ),
        best_stride( _best_stride ),
        read_data( _read_data ) {}

    NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
    uint8 operator() (const uint32 read_id) const
    {
        const io::BestAlignments best( best_data[ read_id ], best_data[ read_id + best_stride ] );

        const uint32 read_offset = read_data.sequence_index()[ read_id ];
        const uint32 read_len    = read_data.sequence_index()[ read_id + 1 ] - read_offset;

        return uint8( mapq_eval(
            best,
            read_len,
            read_len ) );
    }

    const MapqCalculator  mapq_eval;
    const io::Alignment*  best_data;
    const uint32          best_stride;
    const ReadData        read_data;
};

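// Top-level driver for best-approximate single-end alignment: it seeds the
// whole batch of reads, runs one or more seed & extension passes, computes
// mapping qualities, backtracks the best alignments into CIGARs and MD
// strings, and finally hands the results over to the output file.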
template <typename scoring_tag>
void Aligner::best_approx(
    const Params&                 params,
    const fmi_type                fmi,
    const rfmi_type               rfmi,
    const UberScoringScheme&      input_scoring_scheme,
    const io::SequenceDataDevice& reference_data,
    const io::FMIndexDataDevice&  driver_data,
    const io::SequenceDataDevice& read_data,
    io::HostOutputBatchSE&        cpu_batch,
    Stats&                        stats)
{
    // cast the genome to use proper iterators
    const genome_view_type   genome_view( plain_view( reference_data ) );
    const genome_access_type genome_access( genome_view );
    const uint32             genome_len = genome_access.bps();
    const genome_iterator    genome     = genome_access.sequence_stream();

    // prepare the scoring system
    typedef typename ScoringSchemeSelector<scoring_tag>::type  scoring_scheme_type;
    typedef typename scoring_scheme_type::threshold_score_type threshold_score_type;

    scoring_scheme_type scoring_scheme = ScoringSchemeSelector<scoring_tag>::scheme( input_scoring_scheme );

    threshold_score_type threshold_score = scoring_scheme.threshold_score();
    //const int32 score_limit = scoring_scheme_type::worst_score;

    // start timing
    Timer timer;
    Timer global_timer;
    nvbio::cuda::Timer device_timer;

    const uint32 count    = read_data.size();
    const uint32 band_len = band_length( params.max_dist );

    // create a device-side read batch
    const read_view_type  reads_view = plain_view( read_data );
    const read_batch_type reads( reads_view );

    // initialize best-alignments
    init_alignments( reads, threshold_score, best_data_dptr, BATCH_SIZE );

    seed_queues.resize( count );

    thrust::copy(
        thrust::make_counting_iterator(0u),
        thrust::make_counting_iterator(0u) + count,
        seed_queues.in_queue.begin() );

    //
    // Similarly to Bowtie2, we perform a number of seed & extension passes.
    // Whether a read is re-seeded is determined at run time based on seed hit and
    // alignment statistics.
    // Hence, the number of reads actively processed in each pass can vary substantially.
    // In order to keep the cores and all their lanes busy, we use a pair of input & output
    // queues to compact the set of active reads in each round, swapping them at each
    // iteration.
    //
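    //
    // For example, the survivors of a round get compacted into the output
    // queue with a flagged copy followed by a queue swap (a sketch, in terms
    // of the primitives from <nvbio/basic/primitives.h>):
    //
    //   out_size = nvbio::copy_flagged( in_size, in_queue, flags, out_queue, temp );
    //   queues.swap();
    //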

    // filter away reads that map exactly
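    // NOTE: this pass is currently disabled by the if (0) below.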
    if (0)
    {
        // initialize output seeding queue size
        seed_queues.clear_output();

        //
        // perform whole read mapping
        //
        {
            timer.start();
            device_timer.start();

            // initialize the seed hit counts
            hit_deques.clear_deques();

            SeedHitDequeArrayDeviceView hits = hit_deques.device_view();

            log_debug(stderr, "[%u] map\n", ID);
            map_whole_read(
                reads, fmi, rfmi,
                seed_queues.device_view(),
                reseed_dptr,
                hits,
                params,
                params.fw,
                params.rc );

            optional_device_synchronize();
            nvbio::cuda::check_error("mapping kernel");

            device_timer.stop();
            timer.stop();
            stats.map.add( seed_queues.in_size, timer.seconds(), device_timer.seconds() );
        }

        best_approx_score<scoring_tag>(
            params,
            fmi,
            rfmi,
            scoring_scheme,
            reference_data,
            driver_data,
            read_data,
            uint32(-1),
            seed_queues.in_size,
            seed_queues.raw_input_queue(),
            stats );

        log_verbose( stderr, "[%u] %.1f %% reads map exactly\n", ID, 100.0f * float(count - seed_queues.output_size())/float(count) );

        // copy the reads that need reseeding
        seed_queues.out_size[0] = nvbio::copy_flagged(
            seed_queues.in_size,
            seed_queues.raw_input_queue(),
            reseed_dptr,
            seed_queues.raw_output_queue(),
            temp_dvec );

        // swap input & output queues
        seed_queues.swap();
    }

    for (uint32 seeding_pass = 0; seeding_pass < params.max_reseed+1; ++seeding_pass)
    {
        // check whether the input queue is empty
        if (seed_queues.in_size == 0)
            break;

        // initialize output seeding queue size
        seed_queues.clear_output();

        //
        // perform mapping
        //
        {
            timer.start();
            device_timer.start();

            hit_deques.clear_deques();

            SeedHitDequeArrayDeviceView hits = hit_deques.device_view();

            log_debug(stderr, "[%u] map\n", ID);
            map(
                reads, fmi, rfmi,
                seeding_pass, seed_queues.device_view(),
                reseed_dptr,
                hits,
                params,
                params.fw,
                params.rc );

            optional_device_synchronize();
            nvbio::cuda::check_error("mapping kernel");

            device_timer.stop();
            timer.stop();
            stats.map.add( seed_queues.in_size, timer.seconds(), device_timer.seconds() );

            // check if we need to persist this seeding pass
            if ((params.persist_batch   == -1 || batch_number == (uint32) params.persist_batch) &&
                (params.persist_seeding == -1 || seeding_pass == (uint32) params.persist_seeding))
                persist_hits( params.persist_file, "hits", 0u, count, hit_deques );
        }

        // take some stats on the hits we got
        if (seeding_pass == 0 && params.keep_stats)
            keep_stats( reads.size(), stats );

        best_approx_score<scoring_tag>(
            params,
            fmi,
            rfmi,
            scoring_scheme,
            reference_data,
            driver_data,
            read_data,
            seeding_pass,
            seed_queues.in_size,
            seed_queues.raw_input_queue(),
            stats );

        // mark unaligned reads
        mark_unaligned(
            seed_queues.in_size,
            seed_queues.raw_input_queue(),
            best_data_dptr,
            reseed_dptr );

        // copy the reads that need reseeding
        seed_queues.out_size[0] = nvbio::copy_flagged(
            seed_queues.in_size,
            seed_queues.raw_input_queue(),
            reseed_dptr,
            seed_queues.raw_output_queue(),
            temp_dvec );

        // swap input & output queues
        seed_queues.swap();
    }

    //
    // At this point, for each read we have the scores and rough alignment positions of the
    // best two alignments: to compute the final results we need to backtrack the DP extension,
    // and compute accessory CIGARs and MD strings.
    //

    // compute mapq
    {
        log_debug(stderr, "[%u] compute mapq\n", ID);
        typedef BowtieMapq2< SmithWatermanScoringScheme<> > mapq_evaluator_type;

        mapq_evaluator_type mapq_eval( input_scoring_scheme.sw );

        MapqFunctorSE<mapq_evaluator_type,read_view_type> mapq_functor( mapq_eval, best_data_dptr, BATCH_SIZE, reads_view );

        nvbio::transform<device_tag>(
            count,
            thrust::make_counting_iterator<uint32>(0),
            mapq_dvec.begin(),
            mapq_functor );
    }

    TracebackPipelineState<scoring_scheme_type> traceback_state(
        reads,
        reads,
        genome_len,
        genome,
        scoring_scheme,
        *this );

    //
    // perform backtracking and compute cigars for the best alignments
    //
    {
        // initialize cigars & MDS pools
        cigar.clear();
        mds.clear();

        timer.start();
        device_timer.start();

        log_debug(stderr, "[%u] backtrack\n", ID);
        banded_traceback_best(
            0u,
            count,
            NULL,
            best_data_dptr,
            BATCH_SIZE,
            band_len,
            traceback_state,
            params );

        optional_device_synchronize();
        nvbio::cuda::check_error("backtracking kernel");
        if (cigar.has_overflown())
            throw nvbio::runtime_error("CIGAR vector overflow\n");

        device_timer.stop();
        timer.stop();
        stats.backtrack.add( count, timer.seconds(), device_timer.seconds() );

        timer.start();
        device_timer.start();

        log_debug(stderr, "[%u] alignment\n", ID);
        finish_alignment_best(
            0u,
            count,
            NULL,
            best_data_dptr,
            BATCH_SIZE,
            band_len,
            traceback_state,
            input_scoring_scheme.sw, // always use Smith-Waterman for the final scoring of the found alignments
            params );

        optional_device_synchronize();
        nvbio::cuda::check_error("alignment kernel");
        if (mds.has_overflown())
            throw nvbio::runtime_error("MDS vector overflow\n");

        device_timer.stop();
        timer.stop();
        stats.finalize.add( count, timer.seconds(), device_timer.seconds() );
    }

    // wrap the results in a DeviceOutputBatchSE and process
    if (output_file)
    {
        timer.start();

        io::DeviceOutputBatchSE gpu_batch(
            count,
            best_data_dvec,
            io::DeviceCigarArray(cigar, cigar_coords_dvec),
            mds,
            mapq_dvec);

        cpu_batch.readback( gpu_batch );

        timer.stop();
        stats.alignments_DtoH.add( count, timer.seconds() );

        timer.start();

        log_debug(stderr, "[%u] output\n", ID);
        output_file->process( cpu_batch );

        timer.stop();
        stats.io.add( count, timer.seconds() );
    }

    // TODO: decide whether to output second best alignments
    #if 0
    {
        // clear mapq's
        thrust::fill( mapq_dvec.begin(), mapq_dvec.begin() + count, uint8(255) );

        // overlap the second-best indices with the loc queue
        thrust::device_vector<uint32>& second_idx_dvec = scoring_queues.hits.loc;

        // compact the indices of the second-best alignments
        const uint32 n_second = uint32( thrust::copy_if(
            thrust::make_counting_iterator(0u),
            thrust::make_counting_iterator(0u) + count,
            best_data_dvec.begin(),
            second_idx_dvec.begin(),
            io::has_second() ) - second_idx_dvec.begin() ); // FIXME: this has to be fixed since best_data_dvec is now
                                                            // a strided vector of io::Alignment's rather than a vector of
                                                            // io::BestAlignment's

        //
        // perform backtracking and compute cigars for the second-best alignments
        //
        if (n_second)
        {
            // initialize cigars & MDS pools
            cigar.clear();
            mds.clear();

            timer.start();
            device_timer.start();

            uint32* second_idx = thrust::raw_pointer_cast( &second_idx_dvec[0] );

            NVBIO_CUDA_DEBUG_STATEMENT( log_debug(stderr, "[%u] second-best backtrack (%u)\n", ID, n_second) );
            banded_traceback_best(
                1u,
                n_second,
                second_idx,
                best_data_dptr,
                BATCH_SIZE,
                band_len,
                traceback_state,
                params );

            optional_device_synchronize();
            nvbio::cuda::check_error("second-best backtracking kernel");
            if (cigar.has_overflown())
                throw nvbio::runtime_error("CIGAR vector overflow\n");

            device_timer.stop();
            timer.stop();
            stats.backtrack.add( n_second, timer.seconds(), device_timer.seconds() );

            timer.start();
            device_timer.start();

            NVBIO_CUDA_DEBUG_STATEMENT( log_debug(stderr, "[%u] second-best alignment (%u)\n", ID, n_second) );
            finish_alignment_best(
                1u,
                n_second,
                second_idx,
                best_data_dptr,
                BATCH_SIZE,
                band_len,
                traceback_state,
                input_scoring_scheme.sw, // always use Smith-Waterman for the final scoring of the found alignments
                params );

            optional_device_synchronize();
            nvbio::cuda::check_error("second-best alignment kernel");
            if (mds.has_overflown())
                throw nvbio::runtime_error("MDS vector overflow\n");

            device_timer.stop();
            timer.stop();
            stats.finalize.add( n_second, timer.seconds(), device_timer.seconds() );
        }

        // wrap the results in a DeviceOutputBatchSE and process them
        if (output_file)
        {
            timer.start();

            io::DeviceOutputBatchSE gpu_batch(
                count,
                best_data_dvec,
                io::DeviceCigarArray(cigar, cigar_coords_dvec),
                mds,
                mapq_dvec);

            cpu_batch.readback( gpu_batch );

            timer.stop();
            stats.alignments_DtoH.add( count, timer.seconds() );

            timer.start();

            output_file->process( cpu_batch );

            timer.stop();
            stats.io.add( count, timer.seconds() );
        }
    }
    #endif

    // keep alignment stats
    {
        log_debug(stderr, "[%u] track stats\n", ID);
        thrust::host_vector<io::Alignment> h_best_data( BATCH_SIZE*2 );
        thrust::host_vector<uint8>         h_mapq( BATCH_SIZE );

        thrust::copy( best_data_dvec.begin(),              best_data_dvec.begin() + count,              h_best_data.begin() );
        thrust::copy( best_data_dvec.begin() + BATCH_SIZE, best_data_dvec.begin() + count + BATCH_SIZE, h_best_data.begin() + BATCH_SIZE );
        thrust::copy( mapq_dvec.begin(),                   mapq_dvec.begin() + count,                   h_mapq.begin() );

        for (uint32 i = 0; i < count; ++i)
        {
            const io::BestAlignments best( h_best_data[i], h_best_data[i + BATCH_SIZE] );
            const uint8 mapq = h_mapq[i];

            stats.track_alignment_statistics( &stats.mate1, best, mapq );
        }
    }
}

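// A single scoring pass: given a queue of reads together with their seed
// hits, repeatedly select, locate and score hits, reducing the scores into
// the best two alignments of each read, until all the active reads have
// either satisfied their search criteria or exhausted their extension budget.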
template <
    typename scoring_tag,
    typename scoring_scheme_type>
void Aligner::best_approx_score(
    const Params&                 params,
    const fmi_type                fmi,
    const rfmi_type               rfmi,
    const scoring_scheme_type&    scoring_scheme,
    const io::SequenceDataDevice& reference_data,
    const io::FMIndexDataDevice&  driver_data,
    const io::SequenceDataDevice& read_data,
    const uint32                  seeding_pass,
    const uint32                  seed_queue_size,
    const uint32*                 seed_queue,
    Stats&                        stats)
{
    // prepare the scoring system
    typedef typename scoring_scheme_type::threshold_score_type threshold_score_type;

    //threshold_score_type threshold_score = scoring_scheme.threshold_score();
    const int32 score_limit = scoring_scheme_type::worst_score;

    Timer timer;
    Timer global_timer;
    nvbio::cuda::Timer device_timer;

    //const uint32 count = read_data.size();
    const uint32 band_len = band_length( params.max_dist );

    // cast the reads to use proper iterators
    const read_view_type  reads_view = plain_view( read_data );
    const read_batch_type reads( reads_view );

    // cast the genome to use proper iterators
    const genome_view_type   genome_view( plain_view( reference_data ) );
    const genome_access_type genome_access( genome_view );
    const uint32             genome_len = genome_access.bps();
    const genome_iterator    genome     = genome_access.sequence_stream();

    NVBIO_VAR_UNUSED thrust::device_vector<uint32>::iterator hit_read_id_iterator = scoring_queues.hits.read_id.begin();
    NVBIO_VAR_UNUSED thrust::device_vector<uint32>::iterator loc_queue_iterator   = scoring_queues.hits.loc.begin();
    NVBIO_VAR_UNUSED thrust::device_vector<int32>::iterator  score_queue_iterator = scoring_queues.hits.score.begin();

    //
    // At this point we have a queue full of reads, each with an associated set of
    // seed hits encoded as a (sorted) list of SA ranges.
    // For each read we need to:
    //   1. select some seed hit to process (i.e. a row in one of the SA ranges),
    //   2. locate it, i.e. convert it from SA to linear coordinates,
    //   3. and score it,
    // until some search criteria are satisfied.
    // The output queue is then reused in the next round as the input queue, and
    // vice versa.
    //
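    // Concretely, each iteration of the loop below performs one
    // select / locate / score / reduce round:
    //   select : pick the next seed hit(s) to process for each active read
    //   locate : convert the selected hits from SA to linear genome coordinates
    //   score  : run a banded DP extension of each hit against the genome
    //   reduce : fold the new scores into the best two alignments of each read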

    active_read_queues.resize( seed_queue_size );

    thrust::transform(
        thrust::device_ptr<const uint32>( seed_queue ),
        thrust::device_ptr<const uint32>( seed_queue ) + seed_queue_size,
        active_read_queues.in_queue.begin(),
        pack_read( params.top_seed ) );
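    // (pack_read presumably bundles each read id together with the top-seed
    // flag used during hit selection, so that a single packed word can travel
    // through the ping-pong queues)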

    // keep track of the number of extensions performed for each of the active reads
    uint32 n_ext = 0;

    typedef BestApproxScoringPipelineState<scoring_scheme_type> pipeline_type;

    pipeline_type pipeline(
        0u,
        reads,
        reads,
        genome_len,
        genome,
        fmi,
        rfmi,
        scoring_scheme,
        score_limit,
        *this );

    // initialize the hit selection & scoring pipeline
    select_init(
        pipeline,
        params );

    optional_device_synchronize();
    nvbio::cuda::check_error("select-init kernel");

    // prepare the selection context
    SelectBestApproxContext select_context( trys_dptr );

    for (uint32 extension_pass = 0; active_read_queues.in_size && n_ext < params.max_ext; ++extension_pass)
    {
        log_debug(stderr, "[%u] pass:\n[%u] batch: %u\n[%u] seeding pass: %d\n[%u] extension pass: %u\n", ID, ID, batch_number, ID, seeding_pass, ID, extension_pass);

        // check if we need to persist this seeding pass
        if ((params.persist_batch     == -1 || batch_number   == (uint32) params.persist_batch) &&
            (params.persist_seeding   == -1 || seeding_pass   == (uint32) params.persist_seeding) &&
            (params.persist_extension == -1 || extension_pass == (uint32) params.persist_extension))
            persist_hits( params.persist_file, "hits", 0u, reads.size(), hit_deques );

        // initialize all the scoring output queues
        scoring_queues.clear_output();

        // sort the active read infos
        #if 0
        {
            timer.start();
            device_timer.start();

            NVBIO_CUDA_DEBUG_STATEMENT( log_debug(stderr, "[%u] read sort\n", ID) );
            sort_inplace( active_read_queues.in_size, active_read_queues.raw_input_queue() );

            device_timer.stop();
            timer.stop();
            stats.sort.add( active_read_queues.in_size, timer.seconds(), device_timer.seconds() );
        }
        #endif

        timer.start();
        device_timer.start();

        // keep track of how many hits per read we are generating
        pipeline.n_hits_per_read = 1;

        if ((active_read_queues.in_size <= BATCH_SIZE/2) &&
            (params.no_multi_hits == false))
        {
            //
            // The queue of actively processed reads is very small: at this point
            // it's better to select multiple hits to process in each round.
            // This adds some book-keeping overhead, but allows for larger
            // kernel launches.
            //

            // the maximum number of extensions we can perform in one iteration
            // is at most 4096, as we use 12 bits to encode the extension index
            const uint32 max_ext = std::min( 4096u, params.max_ext - n_ext );

            // try to generate BATCH_SIZE items to process
            pipeline.n_hits_per_read = std::min(
                BATCH_SIZE / active_read_queues.in_size,
                max_ext );
        }
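        // (e.g., assuming BATCH_SIZE = 1M: with 2048 active reads the pipeline
        // would select min( 1M / 2048, max_ext ) = up to 512 hits per read in
        // a single round)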
        // else
        //
        // The queue of actively processed reads is sufficiently large to allow
        // selecting & scoring one seed hit at a time and still have large kernel
        // launches. This is algorithmically the most efficient choice (because
        // it enables frequent early outs), so let's choose it.
        //

        // setup the hits queue according to whether we select multiple hits per read or not
        scoring_queues.hits_index.setup( pipeline.n_hits_per_read, active_read_queues.in_size );

        // update pipeline
        pipeline.scoring_queues = scoring_queues.device_view();

        log_debug(stderr, "[%u] select (%u active reads)\n", ID, active_read_queues.in_size);
        select(
            select_context,
            pipeline,
            params );

        optional_device_synchronize();
        nvbio::cuda::check_error("select kernel");

        // this sync point seems very much needed: if we don't place it, we won't see
        // the right number of hits later on...
        cudaDeviceSynchronize();

        device_timer.stop();
        timer.stop();
        stats.select.add( active_read_queues.in_size * pipeline.n_hits_per_read, timer.seconds(), device_timer.seconds() );

        // swap input & output queues
        active_read_queues.swap();

        // update pipeline view
        pipeline.scoring_queues = scoring_queues.device_view();

        // fetch the new queue size
        if (active_read_queues.in_size == 0)
            break;

        // fetch the output queue size
        pipeline.hits_queue_size = pipeline.n_hits_per_read > 1 ? scoring_queues.hits_count() : active_read_queues.in_size;
        if (pipeline.hits_queue_size == 0)
            continue;

        // use the parent queue only if we chose the multiple-hits per read pipeline
        // check if we need to persist this seeding pass
        if ((params.persist_batch     == -1 || batch_number   == (uint32) params.persist_batch) &&
            (params.persist_seeding   == -1 || seeding_pass   == (uint32) params.persist_seeding) &&
            (params.persist_extension == -1 || extension_pass == (uint32) params.persist_extension))
            persist_selection( params.persist_file, "selection",
                0u,
                active_read_queues.in_size,
                active_read_queues.raw_input_queue(),
                pipeline.n_hits_per_read,
                pipeline.hits_queue_size,
                scoring_queues.hits_index,
                scoring_queues.hits );

        log_debug(stderr, "[%u] selected %u hits\n", ID, pipeline.hits_queue_size);
        NVBIO_CUDA_DEBUG_STATEMENT( log_debug( stderr, "[%u] crc: %llu\n", ID, device_checksum( hit_read_id_iterator, hit_read_id_iterator + pipeline.hits_queue_size ) ) );
        NVBIO_CUDA_DEBUG_STATEMENT( log_debug( stderr, "[%u] crc: %llu\n", ID, device_checksum( loc_queue_iterator, loc_queue_iterator + pipeline.hits_queue_size ) ) );

        timer.start();
        device_timer.start();

        // sort the selected hits by their SA coordinate
        log_debug(stderr, "[%u] locate sort\n", ID);
        pipeline.idx_queue = sort_hi_bits( pipeline.hits_queue_size, pipeline.scoring_queues.hits.loc );

        device_timer.stop();
        timer.stop();
        stats.sort.add( pipeline.hits_queue_size, timer.seconds(), device_timer.seconds() );

        timer.start();
        device_timer.start();

        // NOTE: only 75-80% of these locations are unique.
        // It might pay off to do a compaction beforehand.
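        // (such a compaction could be sketched with thrust: e.g. run
        // thrust::unique on a sorted copy of the loc queue and look up only
        // the unique coordinates; this is not done here)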

        // and locate their position in linear coordinates
        log_debug(stderr, "[%u] locate init\n", ID);
        locate_init( pipeline, params );

        optional_device_synchronize();

        log_debug(stderr, "[%u] locate lookup\n", ID);
        locate_lookup( pipeline, params );

        optional_device_synchronize();
        nvbio::cuda::check_error("locating kernel");

        device_timer.stop();
        timer.stop();
        stats.locate.add( pipeline.hits_queue_size, timer.seconds(), device_timer.seconds() );

        NVBIO_CUDA_DEBUG_STATEMENT( log_debug( stderr, "[%u] crc: %llu\n", ID, device_checksum( loc_queue_iterator, loc_queue_iterator + pipeline.hits_queue_size ) ) );

        //
        // Start the real scoring work...
        //

        timer.start();
        device_timer.start();

        // sort the selected hits by their linear genome coordinate
        // TODO: sub-sort by read position/RC flag so as to (1) get better coherence,
        // (2) allow removing duplicate extensions
        log_debug(stderr, "[%u] score sort\n", ID);
        pipeline.idx_queue = sort_hi_bits( pipeline.hits_queue_size, pipeline.scoring_queues.hits.loc );

        device_timer.stop();
        timer.stop();
        stats.sort.add( pipeline.hits_queue_size, timer.seconds(), device_timer.seconds() );

        //
        // assign a score to all selected hits (currently in the output queue)
        //
        float score_time     = 0.0f;
        float dev_score_time = 0.0f;
        timer.start();
        device_timer.start();

        score_best(
            band_len,
            pipeline,
            params );

        optional_device_synchronize();
        nvbio::cuda::check_error("score kernel");

        device_timer.stop();
        timer.stop();
        score_time     += timer.seconds();
        dev_score_time += device_timer.seconds();

        NVBIO_CUDA_DEBUG_STATEMENT( log_debug( stderr, "[%u] crc: %llu\n", ID, device_checksum( score_queue_iterator, score_queue_iterator + pipeline.hits_queue_size ) ) );

        // check if we need to persist this seeding pass
        if ((params.persist_batch     == -1 || batch_number   == (uint32) params.persist_batch) &&
            (params.persist_seeding   == -1 || seeding_pass   == (uint32) params.persist_seeding) &&
            (params.persist_extension == -1 || extension_pass == (uint32) params.persist_extension))
            persist_scores( params.persist_file, "scores", 0u, reads.size(), pipeline.n_hits_per_read, pipeline.hits_queue_size, scoring_queues );

        timer.start();
        device_timer.start();

        const ReduceBestApproxContext reduce_context( pipeline.trys, n_ext );

        // reduce the multiple scores to find the best two alignments
        // (one thread per active read).
        log_debug(stderr, "[%u] score reduce\n", ID);
        score_reduce(
            reduce_context,
            pipeline,
            params );

        optional_device_synchronize();
        nvbio::cuda::check_error("score-reduce kernel");

        // keep track of the number of extensions performed for each of the active reads
        n_ext += pipeline.n_hits_per_read;

        device_timer.stop();
        timer.stop();
        stats.score.add( pipeline.hits_queue_size, score_time + timer.seconds(), dev_score_time + device_timer.seconds() );
    }
}

} // namespace cuda
} // namespace bowtie2
} // namespace nvbio