35 typename scoring_scheme_type>
36 void Aligner::best_exact(
40 const scoring_scheme_type& scoring_scheme,
41 const io::FMIndexDataCUDA& driver_data,
47 typedef uint4 read_storage_type4;
51 typedef read_storage_type read_storage_type_A;
52 typedef read_base_type read_base_type_A;
55 typedef read_storage_type4 read_storage_type_B;
56 typedef read_base_type4 read_base_type_B;
58 typedef read_storage_type read_storage_type_B;
59 typedef read_base_type read_base_type_B;
65 const uint32 count = read_data.size();
71 thrust::make_counting_iterator(0u),
72 thrust::make_counting_iterator(0u) + count,
73 seed_queue_dvec.begin() );
75 seed_queue_size_dvec[0] = count;
91 seed_queues.in_queue = seed_queue_dptr;
92 seed_queues.out_size = NULL;
93 seed_queues.out_queue = NULL;
96 thrust::fill( cnts_dvec.begin(), cnts_dvec.end(),
uint32(0) );
98 Batch<read_base_type_A,const char*> reads_A(
100 read_base_type_A( (
const read_storage_type_A*)read_data.read_stream() ),
101 read_data.read_index(),
102 read_data.qual_stream());
113 0u, seed_queues.device(),
114 hits_dptr, cnts_dptr,
119 hits_dptr, cnts_dptr,
122 cudaThreadSynchronize();
126 stats.map.add( seed_queues.in_size, timer.seconds() );
130 if (params.keep_stats)
134 seed_queues.out_size = seed_queue_size_dptr;
135 seed_queues.out_queue = seed_queue_dptr +
BATCH_SIZE;
137 const uint32 max_seeds = read_data.m_max_read_len / params.seed_len;
139 thrust::device_vector<SeedHit> hits_level_dvec( BATCH_SIZE );
140 thrust::device_vector<uint32> cnts_level_dvec( BATCH_SIZE );
142 for (
uint32 seed_idx = 0; seed_queues.in_size && seed_idx < max_seeds; ++seed_idx)
148 bowtie2_hits_stats( read_data.size(), hits_dptr, cnts_dptr,
hits_stats_dptr );
150 cudaThreadSynchronize();
153 cudaThreadSynchronize();
159 fprintf(stderr,
"\nseed idx: %u (%u active reads - %.2f M global hits)\n", seed_idx, seed_queues.in_size,
float(n_hits) * 1.0e-6f);
163 reads_A, hits_dptr, cnts_dptr,
164 thrust::raw_pointer_cast( &hits_level_dvec.front() ),
165 thrust::raw_pointer_cast( &cnts_level_dvec.front() ) );
167 cudaThreadSynchronize();
179 thrust::raw_pointer_cast( &hits_level_dvec.front() ),
180 thrust::raw_pointer_cast( &cnts_level_dvec.front() ),
184 seed_queues.clear_output();
186 bowtie2_prune_search(
188 seed_queues.device(),
191 cudaThreadSynchronize();
204 Batch<read_base_type_B,const char*> reads_B(
206 read_base_type_B( (
const read_storage_type_B*)read_data.read_stream() ),
207 read_data.read_index(),
208 read_data.qual_stream());
215 BacktrackBestContext<0,edit_distance_scoring> context(
220 read_data.max_read_len(),
225 driver_data.genome_length(),
226 driver_data.genome_stream(),
234 cudaThreadSynchronize();
238 stats.backtrack.add( count, timer.seconds() );
246 thrust::device_vector<uint32>& second_idx_dvec = loc_queue_dvec;
250 thrust::make_counting_iterator(0u),
251 thrust::make_counting_iterator(0u) + count,
253 second_idx_dvec.begin(),
254 has_second() ) - second_idx_dvec.begin() );
263 uint32* second_idx = thrust::raw_pointer_cast( &second_idx_dvec[0] );
265 BacktrackBestContext<1,edit_distance_scoring> context(
270 read_data.max_read_len(),
275 driver_data.genome_length(),
276 driver_data.genome_stream(),
284 cudaThreadSynchronize();
288 stats.backtrack.add( n_second, timer.seconds() );
303 typename scoring_scheme_type>
304 void Aligner::best_exact_score(
305 const Params& params,
308 const scoring_scheme_type& scoring_scheme,
309 const io::FMIndexDataCUDA& driver_data,
312 HostQueues<uint32>& seed_queues,
313 SeedHit* hits_level_dptr,
317 #if USE_UINT4_PACKING
327 bowtie2_hits_stats( read_data.size(), hits_dptr, cnts_dptr,
hits_stats_dptr );
329 cudaThreadSynchronize();
340 const uint32 count = read_data.size();
343 Batch<read_base_type,const char*> reads(
345 read_base_type( (
const read_storage_type*)read_data.read_stream() ),
346 read_data.read_index(),
347 read_data.qual_stream());
360 HostQueues<uint32> hit_queues;
361 hit_queues.in_size = seed_queues.in_size;
362 hit_queues.in_queue = hits_queue_dptr;
363 hit_queues.out_size = hits_queue_size_dptr;
364 hit_queues.out_queue = hits_queue_dptr +
BATCH_SIZE;
366 cudaMemcpy( hit_queues.in_queue, seed_queues.in_queue,
sizeof(
uint32) * hit_queues.in_size, cudaMemcpyDeviceToDevice );
368 bool first_run =
true;
370 SelectBestExactContext select_context(
374 seed_idx == 0 ?
true :
false );
379 uint32 processed_hits = 0;
381 while (hit_queues.in_size)
385 hits_queue_dvec.begin() + (hit_queues.in_queue - hits_queue_dptr),
386 hits_queue_dvec.begin() + (hit_queues.in_queue - hits_queue_dptr) + hit_queues.in_size );
388 fprintf(stderr,
"\rcount: %u (%.2f / %.2f M hits processed) ", hit_queues.in_size,
float(processed_hits) * 1.0e-6f,
float(hits_to_process) * 1.0e-6f);
389 hit_queues.clear_output();
397 if (hit_queues.in_size > 128u*1024u)
412 reads, fmi, rfmi, hits_level_dptr, cnts_level_dptr,
417 hit_queues.out_queue,
424 cudaThreadSynchronize();
428 stats.select.add( hit_queues.in_size, timer.seconds() );
432 if (hit_queues.in_size == 0)
435 processed_hits += hit_queues.in_size;
441 thrust::make_counting_iterator(0u),
442 thrust::make_counting_iterator(0u) + hit_queues.in_size,
446 loc_queue_dvec.begin(),
447 loc_queue_dvec.begin() + hit_queues.in_size,
451 stats.sort.add( hit_queues.in_size, timer.seconds() );
465 cudaThreadSynchronize();
469 stats.locate.add( hit_queues.in_size, timer.seconds() );
477 loc_queue_dvec.begin(),
478 loc_queue_dvec.begin() + hit_queues.in_size,
482 stats.sort.add( hit_queues.in_size, timer.seconds() );
487 if (hit_queues.in_size)
491 ScoreBestContext<edit_distance_scoring> context(
506 driver_data.genome_length(),
507 driver_data.genome_stream(),
510 cudaThreadSynchronize();
513 const ReduceBestExactContext reduce_context;
515 bowtie2_score_reduce<edit_distance_scoring>(
516 reads, reduce_context, hits_level_dptr, cnts_level_dptr,
524 cudaThreadSynchronize();
528 stats.score.add( hit_queues.in_size, timer.seconds() );
541 score_output_count_dvec[0] = 0;
545 const uint32 max_ext = 4096;
550 (256u*1024u) / hit_queues.in_size,
553 const uint32 score_output_queue_stride = hit_queues.in_size;
555 bowtie2_select_multi(
557 reads, fmi, rfmi, hits_level_dptr, cnts_level_dptr,
562 hit_queues.out_queue,
563 score_output_count_dptr,
566 score_parent_queue_dptr,
567 score_output_queue_dptr,
568 score_output_queue_stride,
574 cudaThreadSynchronize();
578 stats.select.add( hit_queues.in_size * n_multi, timer.seconds() );
584 const uint32 score_queue_size = score_output_count_dvec[0];
585 if (score_queue_size == 0)
588 processed_hits += score_queue_size;
594 thrust::make_counting_iterator(0u),
595 thrust::make_counting_iterator(0u) + score_queue_size,
599 loc_queue_dvec.begin(),
600 loc_queue_dvec.begin() + score_queue_size,
604 stats.sort.add( score_queue_size, timer.seconds() );
615 cudaThreadSynchronize();
619 stats.locate.add( score_queue_size, timer.seconds() );
627 loc_queue_dvec.begin(),
628 loc_queue_dvec.begin() + score_queue_size,
632 stats.sort.add( hit_queues.in_size, timer.seconds() );
637 if (score_queue_size)
641 ScoreBestContext<edit_distance_scoring> context(
642 score_parent_queue_dptr,
656 driver_data.genome_length(),
657 driver_data.genome_stream(),
660 cudaThreadSynchronize();
663 const ReduceBestExactContext reduce_context;
667 bowtie2_score_multi_reduce<edit_distance_scoring>(
668 reads, reduce_context, hits_level_dptr, cnts_level_dptr,
672 score_output_queue_stride,
673 score_output_queue_dptr,
674 score_parent_queue_dptr,
679 cudaThreadSynchronize();
683 stats.score.add( score_queue_size, timer.seconds() );