NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
haplotype_caller.h
Go to the documentation of this file.
1 /*
2  * nvbio
3  * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 
29 #pragma once
30 
31 #include "assembly_types.h"
32 #include "bam_io.h"
33 #include "kmers.h"
34 #include "regions.h"
35 #include "assembly_graph.h"
36 
37 using namespace nvbio;
38 
42 {
43  // ---- Parameters ----
44 
45  const static uint32 kmer_size = 10;
46  const static uint32 k_best_haplotypes = 12;
47  const static uint32 active_region_size = 100;
48  const static bool increaze_kmer_size_for_cycles = false;
49  const static uint32 max_reads_in_region_per_sample = 1000;
50  const static uint32 min_reads_per_alignment_start = 5;
51  const static uint32 max_ref_size = 1000;
52  const static uint32 ref_padding = 100;
53 
54  // ---- Data ----
55 
56  // reference genome
59 
60  // active regions
63 
64  // read sequences
68 
69  // read correction
70 
71  // assembly results
72 
73 
74  // load the reference genome on the device
76  d_ref = h_ref;
77  const io::SequenceDataAccess<DNA> d_ref_access(d_ref);
78  d_ref_seqs = d_ref_access.sequence_string_set();
79  }
80 
82  const BAM_alignment_batch_SoA& read_batch,
83  const H_VectorActiveRegions active_regions)
84  {
85  // prepare the ref haplotypes and the reads overlapping the regions
86  const io::SequenceDataAccess<DNA_N> h_ref_access(h_ref);
88  uint64 n_seqs = read_batch.num_alns + active_regions.size(); // includes the ref sequences
89  H_VectorDNA h_seqs;
90  H_VectorU64 h_offsets(n_seqs+1);
91  H_VectorU32 h_active_region_ids(n_seqs);
92  H_VectorU32 h_active_region_offsets(active_regions.size());
93  uint64 offset = 0u;
94 
95  // load each active region
96  uint64 n_seqs_loaded = 0;
97  for(uint32 r = 0; r < active_regions.size(); r++) {
98  h_active_region_offsets[r] = n_seqs_loaded;
99  // reference haplotype
100  active_region region = active_regions[r];
102  for(uint32 i = region.genome_loc.start; i <= region.genome_loc.stop; i++) {
103  h_seqs.push_back(char_to_dna(nvbio::to_char<DNA_N>(seq[i])));
104  }
105  h_active_region_ids[n_seqs_loaded] = r;
106  h_offsets[n_seqs_loaded] = offset;
107  offset += region.genome_loc.stop - region.genome_loc.start + 1;
108  n_seqs_loaded++;
109 
110  // reads overlapping the region
111  for(uint64 i = region.read_batch_offset; i < region.read_batch_offset + region.n_reads; i++) {
112  for(uint32 j = 0; j < read_batch.crq_index[i].read_len; j += 2) {
113  uint8 c1 = read_batch.reads[read_batch.crq_index[i].read_start + j];
114  uint8 c2 = read_batch.reads[read_batch.crq_index[i].read_start + j + 1];
117  }
118  h_offsets[n_seqs_loaded] = offset;
119  h_active_region_ids[n_seqs_loaded] = r;
120  offset += read_batch.crq_index[i].read_len;
121  n_seqs_loaded++;
122  }
123  }
124  h_offsets[n_seqs_loaded] = offset;
125 
126  cuda::Timer timer;
127  timer.start();
128  d_active_region_ids = h_active_region_ids;
129  //d_active_region_offsets = h_active_region_offsets;
130  d_set_seqs_data = h_seqs;
131  D_StreamDNA stream((uint32*) plain_view(d_set_seqs_data.m_storage));
132  d_set_offsets = h_offsets;
133  d_seq_set = D_SequenceSet(n_seqs_loaded, stream, nvbio::plain_view(d_set_offsets));
134  timer.stop();
135  printf("Total input data transfer time : %.8fs\n", timer.seconds());
136  printf("Loaded %llu sequences \n", n_seqs_loaded);
137  }
138 };