NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
aligner_best_approx_paired.h
Go to the documentation of this file.
1 /*
2  * nvbio
3  * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
38 #include <nvbio/basic/cuda/ldg.h>
39 #include <nvbio/basic/primitives.h>
40 
44 
45 namespace nvbio {
46 namespace bowtie2 {
47 namespace cuda {
48 
///
/// A functor to compute the MAPQ of one mate of a read pair.
///
/// For a given read id it gathers the best/second-best alignments of both
/// mates, orders them so that the anchor mate comes first, and evaluates the
/// wrapped MAPQ calculator on the resulting BestPairedAlignments descriptor.
///
/// NOTE(review): this definition was recovered from a doxygen page dump in
/// which the struct-name line, the constructor-declaration line and the
/// pointer/stride member declarations were dropped by the extraction; they
/// have been reconstructed from the initializer list and the call sites
/// (MapqFunctorPE<...>( mate, eval, best_ptr, best_ptr_o, BATCH_SIZE,
/// reads_view1, reads_view2 )) — verify against the upstream NVBIO tree.
///
template <typename MapqCalculator, typename ReadData>
struct MapqFunctorPE
{
    /// constructor
    ///
    /// \param _mate         which mate this functor scores (0 or 1)
    /// \param _mapq_eval    the MAPQ evaluator to apply
    /// \param _best_data    best/second-best alignments of mate 1, strided by _best_stride
    /// \param _best_data_o  best/second-best alignments of mate 2, strided by _best_stride
    /// \param _best_stride  stride between the best and second-best entries
    /// \param _read_data1   view of the first mate's reads
    /// \param _read_data2   view of the second mate's reads
    ///
    MapqFunctorPE(
        const uint32            _mate,
        MapqCalculator          _mapq_eval,
        const io::Alignment*    _best_data,
        const io::Alignment*    _best_data_o,
        const uint32            _best_stride,
        const ReadData          _read_data1,
        const ReadData          _read_data2) :
        mapq_eval( _mapq_eval ),
        best_data( _best_data ),
        best_data_o( _best_data_o ),
        best_stride( _best_stride ),
        read_data1( _read_data1 ),
        read_data2( _read_data2 ),
        mate( _mate ) {}

    /// compute the MAPQ of the selected mate for the given read
    ///
    /// \param read_id  index of the read pair
    /// \return         the MAPQ value, clamped into a uint8
    ///
    uint8 operator() (const uint32 read_id)
    {
        // fetch best & second-best alignments of both mates
        io::BestAlignments best(   best_data[ read_id ],   best_data[ read_id + best_stride ] );
        io::BestAlignments best_o( best_data_o[ read_id ], best_data_o[ read_id + best_stride ] );

        // order them so that the mate this functor scores comes first
        io::BestPairedAlignments best_paired(
            (mate == 0) ? best   : best_o,
            (mate == 0) ? best_o : best );

        // compute the lengths of both mates from the sequence index
        const uint32 read_offset1 = read_data1.sequence_index()[ read_id ];
        const uint32 read_len1    = read_data1.sequence_index()[ read_id + 1 ] - read_offset1;

        const uint32 read_offset2 = read_data2.sequence_index()[ read_id ];
        const uint32 read_len2    = read_data2.sequence_index()[ read_id + 1 ] - read_offset2;

        // the evaluator takes (paired-alignments, anchor-mate length, opposite-mate length);
        // anchor_mate<0>() selects which mate acts as the anchor
        return uint8( mapq_eval(
            best_paired,
            best_paired.anchor_mate<0>() ? read_len2 : read_len1,
            best_paired.anchor_mate<0>() ? read_len1 : read_len2 ) );
    }

    const MapqCalculator mapq_eval;     // the wrapped MAPQ evaluator
    const io::Alignment* best_data;     // mate-1 alignments (reconstructed member — see NOTE above)
    const io::Alignment* best_data_o;   // mate-2 alignments (reconstructed member — see NOTE above)
    const uint32         best_stride;   // stride to the second-best entries (reconstructed member)
    const ReadData       read_data1;    // first mate's read view
    const ReadData       read_data2;    // second mate's read view
    const uint32         mate;          // which mate this functor scores
};
97 
///
/// Top-level paired-end "best-approx" alignment pipeline.
///
/// For each anchor mate (0, then 1) this performs a number of seed-and-extend
/// passes (mapping + scoring), then backtracks the best anchor alignments,
/// computes CIGARs/MDs and MAPQs for both mates, and streams the results back
/// to the host output batch.
///
/// NOTE(review): this block was recovered from a doxygen page dump in which
/// several physical lines were dropped by the extraction — the function
/// signature and most kernel-call-name lines. Those lines have been
/// reconstructed to match the upstream NVBIO sources and are individually
/// marked "NOTE(review): reconstructed" below; verify each against the
/// original tree before relying on this copy. All other statements are kept
/// verbatim from the dump.
///
/// \param params                alignment parameters
/// \param fmi                   forward FM-index
/// \param rfmi                  reverse FM-index
/// \param input_scoring_scheme  the scoring scheme selector input
/// \param reference_data        device-resident reference genome
/// \param driver_data           device-resident FM-index data
/// \param read_data1            first mates
/// \param read_data2            second mates
/// \param cpu_batch             host output batch the results are read back into
/// \param stats                 aggregate timing/alignment statistics
///
template <
    typename scoring_tag>
void Aligner::best_approx(                              // NOTE(review): reconstructed signature line
    const Params&                   params,
    const fmi_type                  fmi,
    const rfmi_type                 rfmi,
    const UberScoringScheme&        input_scoring_scheme,
    const io::SequenceDataDevice&   reference_data,
    const io::FMIndexDataDevice&    driver_data,
    const io::SequenceDataDevice&   read_data1,
    const io::SequenceDataDevice&   read_data2,
    io::HostOutputBatchPE&          cpu_batch,
    Stats&                          stats)
{
    // prepare the scoring system
    typedef typename ScoringSchemeSelector<scoring_tag>::type   scoring_scheme_type;
    typedef typename scoring_scheme_type::threshold_score_type  threshold_score_type;

    scoring_scheme_type scoring_scheme = ScoringSchemeSelector<scoring_tag>::scheme( input_scoring_scheme );

    threshold_score_type threshold_score = scoring_scheme.threshold_score();
    //const int32 score_limit = scoring_scheme_type::worst_score;

    // start timing
    nvbio::Timer timer;
    nvbio::cuda::Timer device_timer;

    const uint32 count    = read_data1.size();
    const uint32 band_len = band_length( params.max_dist );

    // cast the genome to use proper iterators
    const io::LdgSequenceDataView genome_view( plain_view( reference_data ) );
    const genome_access_type      genome_access( genome_view );
    const uint32                  genome_len = genome_access.bps();
    const genome_iterator         genome     = genome_access.sequence_stream();

    // cast the reads to use proper iterators
    const read_view_type  reads_view1 = plain_view( read_data1 );
    const read_view_type  reads_view2 = plain_view( read_data2 );
    const read_batch_type reads1( reads_view1 );
    const read_batch_type reads2( reads_view2 );

    // initialize best-alignments
    init_alignments( reads1, threshold_score, best_data_dptr,   BATCH_SIZE, 0u );
    init_alignments( reads2, threshold_score, best_data_dptr_o, BATCH_SIZE, 1u );

    for (uint32 anchor = 0; anchor < 2; ++anchor)
    {
        log_debug(stderr, "[%u] anchor mate: %u\n", ID, anchor);

        // start with a full seed queue
        seed_queues.in_size = count;

        thrust::copy(
            thrust::make_counting_iterator(0u),
            thrust::make_counting_iterator(0u) + count,
            seed_queues.in_queue.begin() );

        //
        // Similarly to Bowtie2, we perform a number of seed & extension passes.
        // Whether a read is re-seeded is determined at run time based on seed hit and
        // alignment statistics.
        // Hence, the number of reads actively processed in each pass can vary substantially.
        // In order to keep the cores and all its lanes busy, we use a pair of input & output
        // queues to compact the set of active reads in each round, swapping them at each
        // iteration.
        //

        // pick the seeding strand for this anchor from the paired-end policy
        const bool fw_strand =
            (anchor == 0) ? (params.pe_policy == io::PE_POLICY_FF ||    // anchor = 0
                             params.pe_policy == io::PE_POLICY_FR) :
                            (params.pe_policy == io::PE_POLICY_FF ||    // anchor = 1
                             params.pe_policy == io::PE_POLICY_RF);
        const bool fw = fw_strand ? params.fw : params.rc;
        const bool rc = fw_strand ? params.rc : params.fw;

        for (uint32 seeding_pass = 0; seeding_pass < params.max_reseed+1; ++seeding_pass)
        {
            // check whether the input queue is empty
            if (seed_queues.in_size == 0)
                break;

            const uint32 n_active_reads = seed_queues.in_size;

            // initialize output seeding queue size
            seed_queues.clear_output();                         // NOTE(review): reconstructed

            // check if we need to persist this seeding pass
            if (batch_number == (uint32) params.persist_batch &&
                seeding_pass == (uint32) params.persist_seeding)
                persist_reads( params.persist_file, "reads", anchor, n_active_reads, seed_queues.in_queue.begin() );

            //
            // perform mapping
            //
            {
                log_debug(stderr, "[%u] mapping (%u active reads)\n", ID, n_active_reads);
                timer.start();
                device_timer.start();

                // initialize the seed hit counts
                hit_deques.clear_deques();                      // NOTE(review): reconstructed

                SeedHitDequeArrayDeviceView hits = hit_deques.device_view();    // NOTE(review): reconstructed

                map(
                    anchor ? reads2 : reads1, fmi, rfmi,
                    seeding_pass, seed_queues.device_view(),
                    reseed_dptr,
                    hits,
                    params,
                    fw,
                    rc );

                optional_device_synchronize();                  // NOTE(review): reconstructed
                nvbio::cuda::check_error("mapping kernel");

                device_timer.stop();
                timer.stop();
                stats.map.add( n_active_reads, timer.seconds(), device_timer.seconds() );
/*
                #if defined(NVBIO_CUDA_DEBUG)
                {
                    uint64 crc, sum;
                    hits_checksum( n_active_reads, hit_deques, crc, sum );

                    NVBIO_CUDA_DEBUG_STATEMENT( log_debug( stderr, "[%u] crc cnts: %llu\n", ID, device_checksum( hit_deques.counts().begin(), hit_deques.counts().begin() + count ) ) );
                    NVBIO_CUDA_DEBUG_STATEMENT( log_debug( stderr, "[%u] crc hits: %llu\n", ID, crc ) );
                    NVBIO_CUDA_DEBUG_STATEMENT( log_debug( stderr, "[%u] sum hits: %llu\n", ID, sum ) );
                }
                #endif
*/
                // check if we need to persist this seeding pass
                if (batch_number == (uint32) params.persist_batch &&
                    seeding_pass == (uint32) params.persist_seeding)
                    persist_hits( params.persist_file, "hits", anchor, count, hit_deques );
            }

            // take some stats on the hits we got
            if (seeding_pass == 0 && anchor == 0 && params.keep_stats)
                keep_stats( reads1.size(), stats );

            best_approx_score<scoring_tag>(
                params,
                fmi,
                rfmi,
                scoring_scheme,
                reference_data,
                driver_data,
                anchor,
                anchor ? read_data2 : read_data1,
                anchor ? read_data1 : read_data2,
                seeding_pass,
                seed_queues.in_size,                            // NOTE(review): reconstructed
                seed_queues.raw_input_queue(),                  // NOTE(review): reconstructed
                stats );

            // copy the reads that need reseeding
            seed_queues.out_size[0] = copy_flagged(             // NOTE(review): reconstructed
                seed_queues.in_size,                            // NOTE(review): reconstructed
                (uint32*)seed_queues.raw_input_queue(),         // NOTE(review): reconstructed
                reseed_dptr,
                (uint32*)seed_queues.raw_output_queue(),        // NOTE(review): reconstructed
                temp_dvec );

            // swap input & output queues
            seed_queues.swap();
        }
    }

    //
    // At this point, for each read we have the scores and rough alignment positions of the
    // best two alignments: to compute the final results we need to backtrack the DP extension,
    // and compute accessory CIGARs and MD strings.
    //

    thrust::device_vector<io::Alignment>::iterator best_anchor_iterator   = best_data_dvec.begin();
    thrust::device_vector<io::Alignment>::iterator best_opposite_iterator = best_data_dvec_o.begin();

    io::Alignment* best_anchor_ptr   = thrust::raw_pointer_cast( best_anchor_iterator.base() );
    io::Alignment* best_opposite_ptr = thrust::raw_pointer_cast( best_opposite_iterator.base() );

    // reclassify here "discordant" alignments as "paired" (if discordant alignments are allowed)
    if (params.pe_discordant)
    {
        mark_discordant(                                        // NOTE(review): reconstructed call name
            count,
            best_anchor_ptr,
            best_opposite_ptr,
            BATCH_SIZE );
    }

    // compute mapq for the first mate
    {
        log_debug(stderr, "[%u] compute mapq\n", ID);
        typedef BowtieMapq2< SmithWatermanScoringScheme<> > mapq_evaluator_type;

        mapq_evaluator_type mapq_eval( input_scoring_scheme.sw );

        MapqFunctorPE<mapq_evaluator_type,read_view_type> mapq_functor( 0u, mapq_eval, best_anchor_ptr, best_opposite_ptr, BATCH_SIZE, reads_view1, reads_view2 );

        nvbio::transform<device_tag>(
            count,
            thrust::make_counting_iterator<uint32>(0),
            mapq_dvec.begin(),
            mapq_functor );
    }

    // build the state needed by the traceback kernels
    TracebackPipelineState<scoring_scheme_type> traceback_state(    // NOTE(review): reconstructed
        reads1,
        reads2,
        genome_len,
        genome,
        scoring_scheme,
        *this );

    //
    // perform backtracking and compute cigars for the best alignments
    //
    {
        // initialize cigars & MDS
        cigar.clear();
        mds.clear();

        timer.start();
        device_timer.start();

        log_debug(stderr, "[%u] backtracking\n", ID);
        banded_traceback_best(                                  // NOTE(review): reconstructed call name
            0u,
            count,
            NULL,
            best_anchor_ptr,
            BATCH_SIZE,
            band_len,
            traceback_state,
            params );

        optional_device_synchronize();                          // NOTE(review): reconstructed
        nvbio::cuda::check_error("backtracking kernel");
        if (cigar.has_overflown())
            throw nvbio::runtime_error("CIGAR vector overflow\n");

        device_timer.stop();
        timer.stop();
        stats.backtrack.add( count, timer.seconds(), device_timer.seconds() );

        timer.start();
        device_timer.start();

        log_debug(stderr, "[%u] alignment\n", ID);
        finish_alignment_best(                                  // NOTE(review): reconstructed call name
            0u,
            count,
            NULL,
            best_anchor_ptr,
            BATCH_SIZE,
            band_len,
            traceback_state,
            input_scoring_scheme.sw,    // always use Smith-Waterman for the final scoring of the found alignments
            params );

        optional_device_synchronize();                          // NOTE(review): reconstructed
        nvbio::cuda::check_error("alignment kernel");
        if (mds.has_overflown())
            throw nvbio::runtime_error("MDS vector overflow\n");

        device_timer.stop();
        timer.stop();
        stats.finalize.add( count, timer.seconds(), device_timer.seconds() );
    }

    // wrap the results in a DeviceOutputBatchSE and process it
    {
        timer.start();

        io::DeviceOutputBatchSE gpu_batch(
            count,
            best_data_dvec,                                     // NOTE(review): reconstructed
            io::DeviceCigarArray(cigar, cigar_coords_dvec),     // NOTE(review): reconstructed
            mds,
            mapq_dvec);

        cpu_batch.readback( gpu_batch, io::MATE_1 );

        timer.stop();
        stats.alignments_DtoH.add( count, timer.seconds() );
    }

    // compute mapq for the second mate
    {
        log_debug(stderr, "[%u] compute mapq\n", ID);
        typedef BowtieMapq2< SmithWatermanScoringScheme<> > mapq_evaluator_type;

        mapq_evaluator_type mapq_eval( input_scoring_scheme.sw );

        MapqFunctorPE<mapq_evaluator_type,read_view_type> mapq_functor( 1u, mapq_eval, best_anchor_ptr, best_opposite_ptr, BATCH_SIZE, reads_view1, reads_view2 );

        nvbio::transform<device_tag>(
            count,
            thrust::make_counting_iterator<uint32>(0),
            mapq_dvec.begin(),
            mapq_functor );
    }

    //
    // perform backtracking and compute cigars for the opposite mates of the best paired alignments
    //
    {
        // initialize cigars & MDS
        cigar.clear();
        mds.clear();

        //
        // these alignments are of two kinds: concordant, or discordant: this requires some attention,
        // as true opposite paired alignments require full DP backtracking, while unpaired or discordant
        // alignments require the banded version.
        //
        timer.start();
        device_timer.start();

        // overlap the paired indices with the loc queue
        thrust::device_vector<uint32>::iterator paired_idx_begin = scoring_queues.hits.loc.begin();

        // compact the indices of the best concordant alignments
        const uint32 n_paired = uint32( thrust::copy_if(
            thrust::make_counting_iterator(0u),
            thrust::make_counting_iterator(0u) + count,
            best_opposite_iterator,
            paired_idx_begin,
            io::is_concordant() ) - paired_idx_begin );

        if (n_paired)
        {
            log_debug(stderr, "[%u] paired opposite: %u\n", ID, n_paired);
            const uint32* paired_idx = thrust::raw_pointer_cast( paired_idx_begin.base() );

            log_debug(stderr, "[%u] paired opposite backtracking\n", ID);
            opposite_traceback_best(                            // NOTE(review): reconstructed call name
                0u,
                n_paired,
                paired_idx,
                best_opposite_ptr,
                BATCH_SIZE,
                traceback_state,
                params );

            optional_device_synchronize();                      // NOTE(review): reconstructed
            nvbio::cuda::check_error("paired opposite backtracking kernel");
            if (cigar.has_overflown())
                throw nvbio::runtime_error("CIGAR vector overflow\n");
        }

        // overlap the unpaired indices with the loc queue
        thrust::device_vector<uint32>::iterator unpaired_idx_begin = scoring_queues.hits.loc.begin();

        // compact the indices of the best unpaired or discordant alignments
        const uint32 n_unpaired = uint32( thrust::copy_if(
            thrust::make_counting_iterator(0u),
            thrust::make_counting_iterator(0u) + count,
            best_opposite_iterator,
            unpaired_idx_begin,
            io::is_not_concordant() ) - unpaired_idx_begin );

        if (n_unpaired)
        {
            log_debug(stderr, "[%u] unpaired opposite: %u\n", ID, n_unpaired);
            const uint32* unpaired_idx = thrust::raw_pointer_cast( unpaired_idx_begin.base() );

            log_debug(stderr, "[%u] unpaired opposite backtracking\n", ID);
            banded_traceback_best(                              // NOTE(review): reconstructed call name
                0u,
                n_unpaired,
                unpaired_idx,
                best_opposite_ptr,
                BATCH_SIZE,
                band_len,
                traceback_state,
                params );

            optional_device_synchronize();                      // NOTE(review): reconstructed
            nvbio::cuda::check_error("unpaired opposite backtracking kernel");
            if (cigar.has_overflown())
                throw nvbio::runtime_error("CIGAR vector overflow\n");
        }

        device_timer.stop();
        timer.stop();
        stats.backtrack_opposite.add( n_paired + n_unpaired, timer.seconds(), device_timer.seconds() );

        timer.start();
        device_timer.start();

        thrust::device_vector<uint32>::iterator aligned_idx_begin = scoring_queues.hits.loc.begin();

        // compact the indices of the best opposite alignments (concordant or not)
        const uint32 n_aligned = uint32( thrust::copy_if(
            thrust::make_counting_iterator(0u),
            thrust::make_counting_iterator(0u) + count,
            best_opposite_iterator,
            aligned_idx_begin,
            io::is_aligned() ) - aligned_idx_begin );

        if (n_aligned)
        {
            log_debug(stderr, "[%u] opposite alignment: %u\n", ID, n_aligned);
            const uint32* aligned_idx = thrust::raw_pointer_cast( aligned_idx_begin.base() );

            finish_opposite_alignment_best(                     // NOTE(review): reconstructed call name
                0u,
                n_aligned,
                aligned_idx,
                best_opposite_ptr,
                BATCH_SIZE,
                band_len,
                traceback_state,
                input_scoring_scheme.sw,    // always use Smith-Waterman for the final scoring of the found alignments
                params );

            optional_device_synchronize();                      // NOTE(review): reconstructed
            nvbio::cuda::check_error("opposite alignment kernel");
            if (mds.has_overflown())
                throw nvbio::runtime_error("MDS vector overflow\n");
        }

        device_timer.stop();
        timer.stop();
        stats.finalize.add( count, timer.seconds(), device_timer.seconds() );

        // wrap the results in a DeviceOutputBatchSE and process it
        if (output_file)
        {
            timer.start();

            io::DeviceOutputBatchSE gpu_batch(
                count,
                best_data_dvec_o,                               // NOTE(review): reconstructed
                io::DeviceCigarArray(cigar, cigar_coords_dvec), // NOTE(review): reconstructed
                mds,
                mapq_dvec);

            cpu_batch.readback( gpu_batch, io::MATE_2 );

            timer.stop();
            stats.alignments_DtoH.add( count, timer.seconds() );

            timer.start();

            output_file->process( cpu_batch );

            timer.stop();
            stats.io.add( count, timer.seconds() );
        }
    }

    // TODO: decide whether to output second best alignments
    #if 0
    {
        // clear mapq's
        thrust::fill( mapq_dvec.begin(), mapq_dvec.begin() + count, uint8(255) );

        // overlap the second-best indices with the loc queue
        thrust::device_vector<uint32>::iterator second_idx_begin = scoring_queues.hits.loc.begin();

        // compact the indices of the second-best alignments
        const uint32 n_second = uint32( thrust::copy_if(
            thrust::make_counting_iterator(0u),
            thrust::make_counting_iterator(0u) + count,
            best_anchor_iterator,
            second_idx_begin,
            io::has_second() ) - second_idx_begin );

        //
        // perform backtracking and compute cigars for the second-best alignments
        //
        if (n_second)
        {
            // initialize cigars & MDS
            cigar.clear();
            mds.clear();

            timer.start();
            device_timer.start();

            NVBIO_CUDA_DEBUG_STATEMENT( log_debug(stderr, "[%u] second-best: %u\n", ID, n_second) );
            const uint32* second_idx = thrust::raw_pointer_cast( second_idx_begin.base() );

            NVBIO_CUDA_DEBUG_STATEMENT( log_debug(stderr, "[%u] second-best backtracking\n", ID) );
            banded_traceback_best(                              // NOTE(review): reconstructed call name
                1u,
                n_second,
                second_idx,
                best_anchor_ptr,
                BATCH_SIZE,
                band_len,
                traceback_state,
                params );

            optional_device_synchronize();                      // NOTE(review): reconstructed
            nvbio::cuda::check_error("second-best backtracking kernel");
            if (cigar.has_overflown())
                throw nvbio::runtime_error("CIGAR vector overflow\n");

            device_timer.stop();
            timer.stop();
            stats.backtrack.add( n_second, timer.seconds(), device_timer.seconds() );

            timer.start();
            device_timer.start();

            NVBIO_CUDA_DEBUG_STATEMENT( log_debug(stderr, "[%u] second-best alignment\n", ID) );
            finish_alignment_best(                              // NOTE(review): reconstructed call name
                1u,
                n_second,
                second_idx,
                best_anchor_ptr,
                BATCH_SIZE,
                band_len,
                traceback_state,
                input_scoring_scheme.sw,    // always use Smith-Waterman for the final scoring of the found alignments
                params );

            optional_device_synchronize();                      // NOTE(review): reconstructed
            nvbio::cuda::check_error("second-best alignment kernel");
            if (mds.has_overflown())
                throw nvbio::runtime_error("MDS vector overflow\n");

            device_timer.stop();
            timer.stop();
            stats.finalize.add( n_second, timer.seconds(), device_timer.seconds() );
        }

        // wrap the results in a DeviceOutputBatchSE and process it
        {
            timer.start();

            io::DeviceOutputBatchSE gpu_batch(
                count,
                best_data_dvec,                                 // NOTE(review): reconstructed
                io::DeviceCigarArray(cigar, cigar_coords_dvec), // NOTE(review): reconstructed
                mds,
                mapq_dvec);

            cpu_batch.readback( gpu_batch, io::MATE_1 );

            timer.stop();
            stats.alignments_DtoH.add( count, timer.seconds() );
        }

        //
        // perform backtracking and compute cigars for the opposite mates of the second-best paired alignments
        //
        {
            // initialize cigars & MDS pools
            cigar.clear();
            mds.clear();

            timer.start();
            device_timer.start();

            //
            // these alignments are of two kinds: paired, or unpaired: this requires some attention,
            // as true opposite paired alignments require full DP backtracking, while unpaired
            // alignments require the banded version.
            //

            // compact the indices of the second-best paired alignments
            const uint32 n_second_paired = uint32( thrust::copy_if(
                thrust::make_counting_iterator(0u),
                thrust::make_counting_iterator(0u) + count,
                best_opposite_iterator,
                second_idx_begin,
                io::has_second_paired() ) - second_idx_begin );

            const uint32* second_idx = thrust::raw_pointer_cast( second_idx_begin.base() );

            if (n_second_paired)
            {
                NVBIO_CUDA_DEBUG_STATEMENT( log_debug(stderr, "[%u] second-best paired: %u\n", ID, n_second_paired) );

                NVBIO_CUDA_DEBUG_STATEMENT( log_debug(stderr, "[%u] second-best paired opposite backtracking\n", ID) );
                opposite_traceback_best(                        // NOTE(review): reconstructed call name
                    1u,
                    n_second_paired,
                    second_idx,
                    best_opposite_ptr,
                    BATCH_SIZE,
                    traceback_state,
                    params );

                optional_device_synchronize();                  // NOTE(review): reconstructed
                nvbio::cuda::check_error("second-best paired opposite backtracking kernel");
                if (cigar.has_overflown())
                    throw nvbio::runtime_error("CIGAR vector overflow\n");
            }

            // compact the indices of the second-best unpaired alignments
            const uint32 n_second_unpaired = uint32( thrust::copy_if(
                thrust::make_counting_iterator(0u),
                thrust::make_counting_iterator(0u) + count,
                best_opposite_iterator,
                second_idx_begin,
                io::has_second_unpaired() ) - second_idx_begin );

            if (n_second_unpaired)
            {
                NVBIO_CUDA_DEBUG_STATEMENT( log_debug(stderr, "[%u] second-best unpaired: %u\n", ID, n_second_unpaired) );

                NVBIO_CUDA_DEBUG_STATEMENT( log_debug(stderr, "[%u] second-best unpaired opposite backtracking\n", ID) );
                banded_traceback_best(                          // NOTE(review): reconstructed call name
                    1u,
                    n_second_unpaired,
                    second_idx,
                    best_opposite_ptr,
                    BATCH_SIZE,
                    band_len,
                    traceback_state,
                    params );

                optional_device_synchronize();                  // NOTE(review): reconstructed
                nvbio::cuda::check_error("second-best unpaired opposite backtracking kernel");
                if (cigar.has_overflown())
                    throw nvbio::runtime_error("CIGAR vector overflow\n");
            }

            device_timer.stop();
            timer.stop();
            stats.backtrack_opposite.add( n_second_paired + n_second_unpaired, timer.seconds(), device_timer.seconds() );

            timer.start();
            device_timer.start();

            // compact the indices of the second-best alignments
            const uint32 n_second = uint32( thrust::copy_if(
                thrust::make_counting_iterator(0u),
                thrust::make_counting_iterator(0u) + count,
                best_opposite_iterator,
                second_idx_begin,
                io::has_second() ) - second_idx_begin );

            if (n_second)
            {
                // compute alignment only on the opposite mates with a second-best
                NVBIO_CUDA_DEBUG_STATEMENT( log_debug(stderr, "[%u] second-best opposite alignment\n", ID) );
                finish_opposite_alignment_best(                 // NOTE(review): reconstructed call name
                    1u,
                    n_second,
                    second_idx,
                    best_opposite_ptr,
                    BATCH_SIZE,
                    band_len,
                    traceback_state,
                    input_scoring_scheme.sw,    // always use Smith-Waterman for the final scoring of the found alignments
                    params );

                optional_device_synchronize();                  // NOTE(review): reconstructed
                nvbio::cuda::check_error("second-best opposite alignment kernel");
                if (mds.has_overflown())
                    throw nvbio::runtime_error("MDS vector overflow\n");
            }

            device_timer.stop();
            timer.stop();
            stats.finalize.add( n_second, timer.seconds(), device_timer.seconds() );

            // wrap the results in a DeviceOutputBatchSE and process it
            if (output_file)
            {
                timer.start();

                io::DeviceOutputBatchSE gpu_batch(
                    count,
                    best_data_dvec_o,                               // NOTE(review): reconstructed
                    io::DeviceCigarArray(cigar, cigar_coords_dvec), // NOTE(review): reconstructed
                    mds,
                    mapq_dvec);

                cpu_batch.readback( gpu_batch, io::MATE_2 );

                timer.stop();
                stats.alignments_DtoH.add( count, timer.seconds() );

                timer.start();

                output_file->process( cpu_batch );

                timer.stop();
                stats.io.add( count, timer.seconds() );
            }
        }
    }
    #endif

    // keep alignment stats
    {
        log_debug(stderr, "[%u] track stats\n", ID);
        thrust::host_vector<io::Alignment> h_best_data( BATCH_SIZE*2 );
        thrust::host_vector<io::Alignment> h_best_data_o( BATCH_SIZE*2 );
        thrust::host_vector<uint8>         h_mapq( BATCH_SIZE );

        thrust::copy( best_data_dvec.begin(),                best_data_dvec.begin() + count,                h_best_data.begin() );
        thrust::copy( best_data_dvec.begin() + BATCH_SIZE,   best_data_dvec.begin() + count + BATCH_SIZE,   h_best_data.begin() + BATCH_SIZE );
        thrust::copy( best_data_dvec_o.begin(),              best_data_dvec_o.begin() + count,              h_best_data_o.begin() );
        thrust::copy( best_data_dvec_o.begin() + BATCH_SIZE, best_data_dvec_o.begin() + count + BATCH_SIZE, h_best_data_o.begin() + BATCH_SIZE );
        thrust::copy( mapq_dvec.begin(), mapq_dvec.begin() + count, h_mapq.begin() );

        for (uint32 i = 0; i < count; ++i)
        {
            const io::BestAlignments best(   h_best_data[i],   h_best_data[i + BATCH_SIZE] );
            const io::BestAlignments best_o( h_best_data_o[i], h_best_data_o[i + BATCH_SIZE] );
            const uint8 mapq = h_mapq[i];

            stats.track_alignment_statistics( best, best_o, mapq );
        }
    }

    // increase the batch number
    ++batch_number;
}
817 
818 template <
819  typename scoring_tag,
820  typename scoring_scheme_type>
822  const Params& params,
823  const fmi_type fmi,
824  const rfmi_type rfmi,
825  const scoring_scheme_type& scoring_scheme,
826  const io::SequenceDataDevice& reference_data,
827  const io::FMIndexDataDevice& driver_data,
828  const uint32 anchor,
829  const io::SequenceDataDevice& read_data1,
830  const io::SequenceDataDevice& read_data2,
831  const uint32 seeding_pass,
832  const uint32 seed_queue_size,
833  const uint32* seed_queue,
834  Stats& stats)
835 {
836  log_debug(stderr, "[%u] score\n", ID);
837  // prepare the scoring system
838  typedef typename scoring_scheme_type::threshold_score_type threshold_score_type;
839 
840  //threshold_score_type threshold_score = scoring_scheme.threshold_score();
841  const int32 score_limit = scoring_scheme_type::worst_score;
842 
843  // start processing
844  Timer timer;
845  Timer global_timer;
846  nvbio::cuda::Timer device_timer;
847 
848  global_timer.start();
849 
850  const uint32 band_len = band_length( params.max_dist );
851 
852  const read_view_type reads_view1 = plain_view( read_data1 );
853  const read_view_type reads_view2 = plain_view( read_data2 );
854  const read_batch_type reads1( reads_view1 );
855  const read_batch_type reads2( reads_view2 );
856 
857  const io::LdgSequenceDataView genome_view( plain_view( reference_data ) );
858  const genome_access_type genome_access( genome_view );
859  const uint32 genome_len = genome_access.bps();
860  const genome_iterator genome = genome_access.sequence_stream();
861 
862  NVBIO_VAR_UNUSED thrust::device_vector<uint32>::iterator loc_queue_iterator = scoring_queues.hits.loc.begin();
863  thrust::device_vector<int32>::iterator score_queue_iterator = scoring_queues.hits.score.begin();
864  thrust::device_vector<int32>::iterator opposite_score_queue_iterator = scoring_queues.hits.opposite_score.begin();
865  thrust::device_vector<uint32>::iterator opposite_queue_iterator = opposite_queue_dvec.begin();
866 
867  //
868  // At this point we have a queue full of reads, each with an associated set of
869  // seed hits encoded as a (sorted) list of SA ranges.
870  // For each read we need to:
871  // 1. select some seed hit to process (i.e. a row in one of the SA ranges)
872  // 2. locate it, i.e. converting from SA to linear coordinates
873  // 3. and score it
874  // until some search criteria are satisfied.
875  // The output queue is then reused in the next round as the input queue, and
876  // viceversa.
877  //
879 
880  active_read_queues.resize( seed_queue_size );
881 
883  thrust::device_ptr<const uint32>( seed_queue ),
884  thrust::device_ptr<const uint32>( seed_queue ) + seed_queue_size,
885  active_read_queues.in_queue.begin(),
886  pack_read( params.top_seed ) );
887 
888  // keep track of the number of extensions performed for each of the active reads
889  uint32 n_ext = 0;
890 
892 
893  pipeline_type pipeline(
894  anchor,
895  reads1,
896  reads2,
897  genome_len,
898  genome,
899  fmi,
900  rfmi,
901  scoring_scheme,
902  score_limit,
903  *this );
904 
905  // initialize the hit selection & scoring pipeline
906  select_init(
907  pipeline,
908  params );
909 
911  nvbio::cuda::check_error("select-init kernel");
912 
913  // prepeare the selection context
914  SelectBestApproxContext select_context( trys_dptr );
915 
916  for (uint32 extension_pass = 0; active_read_queues.in_size && n_ext < params.max_ext; ++extension_pass)
917  {
918  log_debug(stderr, "[%u] pass:\n[%u] batch:[%u] %u\n[%u] seeding pass: %u\n[%u] extension pass: %u\n", ID, ID, batch_number, ID, seeding_pass, ID, extension_pass);
919 
920  // initialize all the scoring output queues
922 
923  // sort the active read infos
924  #if 0
925  {
926  timer.start();
927  device_timer.start();
928 
929  NVBIO_CUDA_DEBUG_STATEMENT( log_debug(stderr, "[%u] read sort\n", ID) );
930  sort_inplace( active_read_queues.in_size, active_read_queues.raw_input_queue() );
931 
932  device_timer.stop();
933  timer.stop();
934  stats.sort.add( active_read_queues.in_size, timer.seconds(), device_timer.seconds() );
935  }
936  #endif
937 
938  timer.start();
939  device_timer.start();
940 
941  // keep track of how many hits per read we are generating
942  pipeline.n_hits_per_read = 1;
943 
944  if ((active_read_queues.in_size <= BATCH_SIZE/2) &&
945  (params.no_multi_hits == false))
946  {
947  //
948  // The queue of actively processed reads is very small: at this point
949  // it's better to select multiple hits to process in each round.
950  // This adds some book keeping overheads, but allows to make larger
951  // kernel launches.
952  //
953 
954  // the maximum number of extensions we can perform in one iteration
955  // is at most 4096, as we use 12 bits to encode the extension index
956  const uint32 max_ext = std::min( 4096u, params.max_ext - n_ext );
957 
958  // try to generate BATCH_SIZE items to process
959  pipeline.n_hits_per_read = std::min(
960  BATCH_SIZE / active_read_queues.in_size,
961  max_ext );
962  }
963  // else
964  //
965  // The queue of actively processed reads is sufficiently large to allow
966  // selecting & scoring one seed hit at a time and still have large kernel
967  // launches. This is algorithmically the most efficient choice (because
968  // it enables frequent early outs), so let's choose it.
969  //
970 
971  // setup the hits queue according to whether we select multiple hits per read or not
972  scoring_queues.hits_index.setup( pipeline.n_hits_per_read, active_read_queues.in_size );
973 
974  // update pipeline
975  pipeline.scoring_queues = scoring_queues.device_view();
976 
977  log_debug(stderr, "[%u] select\n", ID);
978  select(
979  select_context,
980  pipeline,
981  params );
982 
984  nvbio::cuda::check_error("select kernel");
985 
986  // this sync point seems very much needed: if we don't place it, we won't see
987  // the right number of hits later on...
988  cudaDeviceSynchronize();
989 
990  device_timer.stop();
991  timer.stop();
992  stats.select.add( active_read_queues.in_size * pipeline.n_hits_per_read, timer.seconds(), device_timer.seconds() );
993 
994  // swap input & output queues
995  active_read_queues.swap();
996 
997  // update pipeline view
998  pipeline.scoring_queues = scoring_queues.device_view();
999 
1000  // fetch the new queue size
1001  if (active_read_queues.in_size == 0)
1002  break;
1003 
1004  // fetch the output queue size
1005  pipeline.hits_queue_size = pipeline.n_hits_per_read > 1 ? scoring_queues.hits_count() : active_read_queues.in_size;
1006  if (pipeline.hits_queue_size == 0)
1007  continue;
1008 
1009  // use the parent queue only if we chose the multiple-hits per read pipeline
1010  // check if we need to persist this seeding pass
1011  if (batch_number == (uint32) params.persist_batch &&
1012  seeding_pass == (uint32) params.persist_seeding &&
1013  extension_pass == (uint32) params.persist_extension)
1014  persist_selection( params.persist_file, "selection",
1015  anchor,
1016  active_read_queues.in_size,
1017  active_read_queues.raw_input_queue(),
1018  pipeline.n_hits_per_read,
1019  pipeline.hits_queue_size,
1021  scoring_queues.hits );
1022 
1023  log_debug(stderr, "[%u] selected %u hits\n", ID, pipeline.hits_queue_size);
1024 
1025  timer.start();
1026  device_timer.start();
1027 
1028  // sort the selected hits by their SA coordinate
1029  log_debug(stderr, "[%u] locate sort\n", ID);
1030  pipeline.idx_queue = sort_hi_bits( pipeline.hits_queue_size, pipeline.scoring_queues.hits.loc );
1031 
1032  device_timer.stop();
1033  timer.stop();
1034  stats.sort.add( pipeline.hits_queue_size, timer.seconds(), device_timer.seconds() );
1035 
1036  timer.start();
1037  device_timer.start();
1038 
1039  // NOTE: only 75-80% of these locations are unique.
1040  // It might pay off to do a compaction beforehand.
1041 
1042  // and locate their position in linear coordinates
1043  log_debug(stderr, "[%u] locate init\n", ID);
1044  locate_init( pipeline, params );
1045 
1047 
1048  log_debug(stderr, "[%u] locate lookup\n", ID);
1049  locate_lookup( pipeline, params );
1050 
1052  nvbio::cuda::check_error("locating kernel");
1053 
1054  device_timer.stop();
1055  timer.stop();
1056  stats.locate.add( pipeline.hits_queue_size, timer.seconds(), device_timer.seconds() );
1057 
1058  NVBIO_CUDA_DEBUG_STATEMENT( log_debug( stderr, "[%u] crc: %llu\n", ID, device_checksum( loc_queue_iterator, loc_queue_iterator + pipeline.hits_queue_size ) ) );
1059 
1060  //
1061  // Start the real scoring work...
1062  //
1063 
1064  timer.start();
1065  device_timer.start();
1066 
1067  // sort the selected hits by their linear genome coordinate
1068  // TODO: sub-sort by read position/RC flag so as to (1) get better coherence,
1069  // (2) allow removing duplicate extensions
1070  log_debug(stderr, "[%u] score sort\n", ID);
1071  pipeline.idx_queue = sort_hi_bits( pipeline.hits_queue_size, pipeline.scoring_queues.hits.loc );
1072 
1073  device_timer.stop();
1074  timer.stop();
1075  stats.sort.add( pipeline.hits_queue_size, timer.seconds(), device_timer.seconds() );
1076 
1077  //
1078  // assign a score to all selected hits (currently in the output queue)
1079  //
1080  float score_time = 0.0f;
1081  float dev_score_time = 0.0f;
1082  timer.start();
1083  device_timer.start();
1084 
1086  band_len,
1087  pipeline,
1088  params );
1089 
1091  nvbio::cuda::check_error("score kernel");
1092 
1093  device_timer.stop();
1094  timer.stop();
1095  score_time += timer.seconds();
1096  dev_score_time += device_timer.seconds();
1097 
1098  NVBIO_CUDA_DEBUG_STATEMENT( log_debug( stderr, "[%u] crc: %llu\n", ID, device_checksum( score_queue_iterator, score_queue_iterator + pipeline.hits_queue_size ) ) );
1099 
1100  //
1101  // compact the list of candidate hits (with a valid anchor mate score) and perform DP alignment
1102  // on the opposite mates
1103  //
1104 
1105  timer.start();
1106  device_timer.start();
1107 
1108  // NOTE:
1109  // Here we want the output opposite_queue to contain the list of indices *into* idx_queue, so as to keep the
1110  // original sorting order by genome coordintes used for scoring the anchor.
1111  // Down the pipeline the scoring kernel will address the problems by idx_queue[ opposite_score_queue[ thread_id ] ].
1112  log_debug(stderr, "[%u] compact opposite\n", ID);
1113  const int32 worst_score = scoring_scheme_type::worst_score;
1114 
1115  pipeline.opposite_queue_size = uint32( thrust::copy_if(
1116  thrust::make_counting_iterator(0u),
1117  thrust::make_counting_iterator(0u) + pipeline.hits_queue_size,
1118  thrust::make_permutation_iterator( score_queue_iterator, thrust::device_ptr<uint32>( pipeline.idx_queue ) ), // gather from the indexed score queue
1119  opposite_queue_iterator,
1120  bind_second_functor< not_equal_functor<int32> >( worst_score ) ) - opposite_queue_iterator );
1121 
1122  // make sure the reducer sees correct scores
1123  thrust::fill(
1124  opposite_score_queue_iterator,
1125  opposite_score_queue_iterator + pipeline.hits_queue_size,
1126  worst_score );
1127 
1128  if (pipeline.opposite_queue_size)
1129  {
1130  // perform DP alignment on the opposite mates
1131  log_debug(stderr, "[%u] score opposite (%u)\n", ID, pipeline.opposite_queue_size );
1133  pipeline,
1134  params );
1135 
1136  NVBIO_CUDA_DEBUG_STATEMENT( log_debug( stderr, "[%u] crc: %llu\n", device_checksum( opposite_score_queue_iterator, opposite_score_queue_iterator + pipeline.hits_queue_size ) ) );
1137 
1138  // check if we need to persist this seeding pass
1139  if (batch_number == (uint32) params.persist_batch &&
1140  seeding_pass == (uint32) params.persist_seeding &&
1141  extension_pass == (uint32) params.persist_extension)
1142  persist_scores( params.persist_file, "opposite-score",
1143  anchor,
1144  active_read_queues.in_size,
1145  pipeline.n_hits_per_read,
1146  pipeline.hits_queue_size,
1147  scoring_queues );
1148  }
1149 
1151  nvbio::cuda::check_error("opposite-score kernel");
1152 
1153  device_timer.stop();
1154  timer.stop();
1155  stats.opposite_score.add( pipeline.opposite_queue_size, timer.seconds(), device_timer.seconds() );
1156  //stats.opposite_score.user[0] += score_mate_work_queue_stats.utilization(nvbio::cuda::STREAM_EVENT);
1157  //stats.opposite_score.user[1] += score_mate_work_queue_stats.utilization(nvbio::cuda::RUN_EVENT);
1158  //stats.opposite_score.user[2] += score_mate_work_queue_stats.avg_iterations();
1159  //stats.opposite_score.user[3] = nvbio::max( stats.opposite_score.user[3], score_mate_work_queue_stats.max_iterations() - score_mate_work_queue_stats.avg_iterations() );
1160 
1161  timer.start();
1162  device_timer.start();
1163 
1164  const ReduceBestApproxContext reduce_context( pipeline.trys, n_ext );
1165 
1166  // reduce the multiple scores to find the best two alignments
1167  // (one thread per active read).
1168  log_debug(stderr, "[%u] score reduce\n", ID);
1170  reduce_context,
1171  pipeline,
1172  params );
1173 
1175  nvbio::cuda::check_error("score-reduce kernel");
1176 
1177  // keep track of the number of extensions performed for each of the active reads
1178  n_ext += pipeline.n_hits_per_read;
1179 
1180  device_timer.stop();
1181  timer.stop();
1182  stats.score.add( pipeline.hits_queue_size, score_time + timer.seconds(), dev_score_time + device_timer.seconds() );
1183  }
1184 
1186  global_timer.stop();
1187  stats.scoring_pipe.add( seed_queue_size, global_timer.seconds(), global_timer.seconds() );
1188 }
1189 
1190 } // namespace cuda
1191 } // namespace bowtie2
1192 } // namespace nvbio