NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
nvbio-aln-diff.cpp
Go to the documentation of this file.
1 /*
2  * nvbio
3  * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
31 #include <nvbio-aln-diff/utils.h>
32 #include <nvbio/basic/types.h>
33 #include <nvbio/basic/console.h>
34 #include <nvbio/basic/html.h>
36 #include <cuda_runtime_api.h>
37 #include <vector_types.h>
38 #include <vector_functions.h>
39 #include <stdio.h>
40 #include <stdlib.h>
41 #include <string.h>
42 #include <vector>
43 #include <string>
44 
45 void crcInit();
46 
47 using namespace nvbio;
48 using namespace alndiff;
49 
50 
51 int main(int argc, char* argv[])
52 {
53  crcInit();
54 
55  int cuda_device = -1;
56  int device_count;
57  cudaGetDeviceCount(&device_count);
58  log_verbose(stderr, " cuda devices : %d\n", device_count);
59 
60  const char* report_name = NULL;
61 
62  const char* filter_name = NULL;
63  uint32 filter_flags = Filter::ALL;
64  uint32 filter_stats = Filter::ALL;
65  int32 filter_delta = 5;
66  bool paired = false;
67  bool id_check = true;
68 
69  int arg = 1;
70  while (arg < argc)
71  {
72  if (strcmp( argv[arg], "-device" ) == 0)
73  {
74  cuda_device = atoi(argv[++arg]);
75  ++arg;
76  }
77  else if (strcmp( argv[arg], "-paired" ) == 0)
78  {
79  paired = true;
80  ++arg;
81  }
82  else if (strcmp( argv[arg], "-report" ) == 0)
83  {
84  report_name = argv[++arg];
85  ++arg;
86  }
87  else if (strcmp( argv[arg], "-no-ids" ) == 0)
88  {
89  id_check = false;
90  ++arg;
91  }
92  else if (strcmp( argv[arg], "-filter" ) == 0)
93  {
94  filter_name = argv[++arg];
95 
96  const std::string flags_string( argv[++arg] );
97  const std::string stats_string( argv[++arg] );
98  filter_delta = atoi(argv[++arg]);
99  ++arg;
100 
101  char temp[256];
102 
103  // parse the filter flags
104  const char* begin = flags_string.c_str();
105  const char* end = begin;
106  filter_flags = 0u;
107 
108  while (1)
109  {
110  while (*end != '|' && *end != '\0')
111  {
112  temp[end - begin] = *end;
113  end++;
114  }
115 
116  temp[end - begin] = '\0';
117 
118  if (strcmp( temp, "distant" ) == 0)
119  filter_flags |= Filter::DISTANT;
120  else if (strcmp( temp, "discordant" ) == 0)
121  filter_flags |= Filter::DISCORDANT;
122  else if (strcmp( temp, "diff-ref" ) == 0)
123  filter_flags |= Filter::DIFFERENT_REF;
124 
125  if (*end == '\0')
126  break;
127 
128  ++end; begin = end;
129  }
130 
131  // parse the filter stats
132  begin = stats_string.c_str();
133  end = begin;
134  filter_stats = 0u;
135 
136  while (1)
137  {
138  while (*end != '|' && *end != '\0')
139  {
140  temp[end - begin] = *end;
141  end++;
142  }
143 
144  temp[end - begin] = '\0';
145 
146  if (strcmp( temp, "ed" ) == 0)
147  filter_stats |= Filter::ED;
148  else if (strcmp( temp, "mapQ" ) == 0)
149  filter_stats |= Filter::MAPQ;
150  else if (strcmp( temp, "mms" ) == 0)
151  filter_stats |= Filter::MMS;
152  else if (strcmp( temp, "ins" ) == 0)
153  filter_stats |= Filter::INS;
154  else if (strcmp( temp, "dels" ) == 0)
155  filter_stats |= Filter::DELS;
156 
157  if (*end == '\0')
158  break;
159 
160  ++end; begin = end;
161  }
162  }
163  else
164  break;
165  }
166 
167  // inspect and select cuda devices
168  if (device_count)
169  {
170  if (cuda_device == -1)
171  {
172  int best_device = 0;
173  cudaDeviceProp best_device_prop;
174  cudaGetDeviceProperties( &best_device_prop, best_device );
175 
176  for (int device = 0; device < device_count; ++device)
177  {
178  cudaDeviceProp device_prop;
179  cudaGetDeviceProperties( &device_prop, device );
180  log_verbose(stderr, " device %d has compute capability %d.%d\n", device, device_prop.major, device_prop.minor);
181  log_verbose(stderr, " SM count : %u\n", device_prop.multiProcessorCount);
182  log_verbose(stderr, " SM clock rate : %u Mhz\n", device_prop.clockRate / 1000);
183  log_verbose(stderr, " memory clock rate : %.1f Ghz\n", float(device_prop.memoryClockRate) * 1.0e-6f);
184 
185  if (device_prop.major >= best_device_prop.major &&
186  device_prop.minor >= best_device_prop.minor)
187  {
188  best_device_prop = device_prop;
189  best_device = device;
190  }
191  }
192  cuda_device = best_device;
193  }
194  log_verbose(stderr, " chosen device %d\n", cuda_device);
195  {
196  cudaDeviceProp device_prop;
197  cudaGetDeviceProperties( &device_prop, cuda_device );
198  log_verbose(stderr, " device name : %s\n", device_prop.name);
199  log_verbose(stderr, " compute capability : %d.%d\n", device_prop.major, device_prop.minor);
200  }
201  cudaSetDevice( cuda_device );
202  }
203 
204  if (argc < arg + 2)
205  {
206  log_info(stderr, "nvbio-aln-diff [OPTIONS] <file1> <file2>\n");
207  log_info(stderr, "OPTIONS:\n");
208  log_info(stderr, " -paired # paired-end input\n" );
209  log_info(stderr, " -no-ids # do not perform id checks\n" );
210  log_info(stderr, " -report <file-name> # HTML report\n" );
211  log_info(stderr, " -filter <file-name>\n" );
212  log_info(stderr, " <flags={distant|discordant|diff-ref}>\n" );
213  log_info(stderr, " <stats={ed|mapQ|mms|ins|dels}>\n" );
214  log_info(stderr, " int:delta\n" );
215  return 0;
216  }
217 
218  if (filter_name)
219  {
220  std::string filter_flags_string;
221  if (filter_flags & Filter::DISTANT)
222  filter_flags_string += std::string(filter_flags_string.length() ? "|" : "") + "distant";
223  if (filter_flags & Filter::DISCORDANT)
224  filter_flags_string += std::string(filter_flags_string.length() ? "|" : "") + "discordant";
225  if (filter_flags & Filter::DIFFERENT_REF)
226  filter_flags_string += std::string(filter_flags_string.length() ? "|" : "") + "diff-ref";
227 
228  std::string filter_stats_string;
229  if (filter_stats & Filter::ED)
230  filter_stats_string += std::string(filter_stats_string.length() ? "|" : "") + "ed";
231  if (filter_stats & Filter::MAPQ)
232  filter_stats_string += std::string(filter_stats_string.length() ? "|" : "") + "mapQ";
233  if (filter_stats & Filter::MMS)
234  filter_stats_string += std::string(filter_stats_string.length() ? "|" : "") + "mms";
235  if (filter_stats & Filter::INS)
236  filter_stats_string += std::string(filter_stats_string.length() ? "|" : "") + "ins";
237  if (filter_stats & Filter::DELS)
238  filter_stats_string += std::string(filter_stats_string.length() ? "|" : "") + "dels";
239 
240  log_info(stderr, "filter file : %s\n", filter_name);
241  log_info(stderr, " flags : %s\n", filter_flags_string.c_str());
242  log_info(stderr, " stats : %s\n", filter_stats_string.c_str());
243  log_info(stderr, " delta : %d\n", filter_delta);
244  }
245 
246  if (argc == arg + 2)
247  {
248  const char *aln_file_nameL = argv[arg];
249  const char *aln_file_nameR = argv[arg+1];
250 
251  if (paired)
252  {
253  const char *aln_file_nameL = argv[arg];
254  const char *aln_file_nameR = argv[arg+1];
255 
258 
259  if (aln_streamL == NULL || aln_streamL->is_ok() == false) { log_error(stderr, "failed opening \"%s\"\n", aln_file_nameL); exit(1); }
260  if (aln_streamR == NULL || aln_streamR->is_ok() == false) { log_error(stderr, "failed opening \"%s\"\n", aln_file_nameR); exit(1); }
261 
262  const uint32 BATCH_SIZE = 500000;
263  std::vector<Alignment> batchL( BATCH_SIZE );
264  std::vector<Alignment> batchR( BATCH_SIZE );
265 
266  Filter filter( filter_name, filter_flags, filter_stats, filter_delta );
267  SharedPointer<PEAnalyzer> analyzer = SharedPointer<PEAnalyzer>( new PEAnalyzer( filter, id_check ) );
268 
269  uint32 n_batch = 0;
270  while (1)
271  {
272  const uint32 batch_sizeL = aln_streamL->next_batch( BATCH_SIZE, &batchL[0] );
273  const uint32 batch_sizeR = aln_streamR->next_batch( BATCH_SIZE, &batchR[0] );
274 
275  const uint32 min_batch_size = nvbio::min( batch_sizeL, batch_sizeR );
276  const uint32 max_batch_size = nvbio::max( batch_sizeL, batch_sizeR );
277 
278  if (min_batch_size != max_batch_size)
279  log_warning(stderr, "alignment files have different size\n");
280 
281  log_info(stderr, "analizing batch[%u]: %u alignments (%.1f M)\n", n_batch, min_batch_size, float(n_batch * BATCH_SIZE + min_batch_size)*1.0e-6f);
282 
283  for (uint32 i = 0; i < min_batch_size; i += 2)
284  {
285  const Alignment* alnL1 = &batchL[i];
286  const Alignment* alnL2 = &batchL[i+1];
287  const Alignment* alnR1 = &batchR[i];
288  const Alignment* alnR2 = &batchR[i+1];
289 
290  if (alnL1->is_mapped() && alnL1->mate == alnL2->mate)
291  {
292  log_error(stderr, "alignments %u and %u in \"%s\" refer to the same mate, must come from different reads\n", i, i+1, aln_file_nameL);
293  return 1;
294  }
295  if (alnR1->is_mapped() && alnR1->mate == alnR2->mate)
296  {
297  log_error(stderr, "alignments %u and %u in \"%s\" refer to the same mate, must come from different reads\n", i, i+1, aln_file_nameR);
298  return 1;
299  }
300 
301  if (alnL1->mate) std::swap( alnL1, alnL2 );
302  if (alnR1->mate) std::swap( alnR1, alnR2 );
303 
304  analyzer->push(
305  AlignmentPair( *alnL1, *alnL2 ),
306  AlignmentPair( *alnR1, *alnR2 ));
307  }
308 
309  if (report_name)
310  analyzer->generate_report( aln_file_nameL, aln_file_nameR, report_name );
311 
312  analyzer->flush();
313 
314  log_verbose(stderr, " mismatched : %5.2f%%\n", 100.0f * analyzer->mismatched());
315  log_verbose(stderr, " mapped [L] : %5.2f%%\n", 100.0f * analyzer->mapped.avg_L());
316  log_verbose(stderr, " mapped [R] : %5.2f%%\n", 100.0f * analyzer->mapped.avg_R());
317  log_verbose(stderr, " mapped [L&R] : %5.2f%%\n", 100.0f * analyzer->mapped.avg_L_and_R());
318  log_verbose(stderr, " mapped/unmapped [L] : %5.2f%%\n", 100.0f * analyzer->mapped.avg_L_not_R());
319  log_verbose(stderr, " mapped/unmapped [R] : %5.2f%%\n", 100.0f * analyzer->mapped.avg_R_not_L());
320  log_verbose(stderr, " paired [L] : %5.2f%%\n", 100.0f * analyzer->paired.avg_L());
321  log_verbose(stderr, " paired [R] : %5.2f%%\n", 100.0f * analyzer->paired.avg_R());
322  log_verbose(stderr, " paired [L&R] : %5.2f%%\n", 100.0f * analyzer->paired.avg_L_and_R());
323  log_verbose(stderr, " paired/unpaired [L] : %5.2f%%\n", 100.0f * analyzer->paired.avg_L_not_R());
324  log_verbose(stderr, " paired/unpaired [R] : %5.2f%%\n", 100.0f * analyzer->paired.avg_R_not_L());
325  log_verbose(stderr, " different ref : %5.2f%%\n", 100.0f * analyzer->different_ref());
326  log_verbose(stderr, " distant : %5.2f%%\n", 100.0f * analyzer->distant());
327  log_verbose(stderr, " discordant : %5.2f%%\n", 100.0f * analyzer->discordant());
328  log_verbose(stderr, " filtered : %u\n", analyzer->filtered());
329 
330  if (min_batch_size < BATCH_SIZE)
331  break;
332 
333  ++n_batch;
334  }
335  }
336  else
337  {
340 
341  if (aln_streamL == NULL || aln_streamL->is_ok() == false) { log_error(stderr, "failed opening \"%s\"\n", aln_file_nameL); exit(1); }
342  if (aln_streamR == NULL || aln_streamR->is_ok() == false) { log_error(stderr, "failed opening \"%s\"\n", aln_file_nameR); exit(1); }
343 
344  const uint32 BATCH_SIZE = 500000;
345  std::vector<Alignment> batchL( BATCH_SIZE );
346  std::vector<Alignment> batchR( BATCH_SIZE );
347 
348  Filter filter( filter_name, filter_flags, filter_stats, filter_delta );
350 
351  uint32 n_batch = 0;
352  while (1)
353  {
354  const uint32 batch_sizeL = aln_streamL->next_batch( BATCH_SIZE, &batchL[0] );
355  const uint32 batch_sizeR = aln_streamR->next_batch( BATCH_SIZE, &batchR[0] );
356 
357  if (batch_sizeL != batch_sizeR)
358  log_warning(stderr, "alignment files have different size\n");
359 
360  const uint32 min_batch_size = nvbio::min( batch_sizeL, batch_sizeR );
361 
362  log_info(stderr, "analizing batch[%u]: %u alignments (%.1f M)\n", n_batch, min_batch_size, float(n_batch * BATCH_SIZE + min_batch_size)*1.0e-6f);
363 
364  for (uint32 i = 0; i < min_batch_size; ++i)
365  analyzer->push( batchL[i], batchR[i] );
366 
367  if (report_name)
368  analyzer->generate_report( aln_file_nameL, aln_file_nameR, report_name );
369 
370  analyzer->flush();
371 
372  log_verbose(stderr, " mismatched : %5.2f%%\n", 100.0f * analyzer->mismatched());
373  log_verbose(stderr, " mapped [L] : %5.2f%%\n", 100.0f * analyzer->mapped.avg_L());
374  log_verbose(stderr, " mapped [R] : %5.2f%%\n", 100.0f * analyzer->mapped.avg_R());
375  log_verbose(stderr, " mapped [L&R] : %5.2f%%\n", 100.0f * analyzer->mapped.avg_L_and_R());
376  log_verbose(stderr, " mapped/unmapped [L] : %5.2f%%\n", 100.0f * analyzer->mapped.avg_L_not_R());
377  log_verbose(stderr, " mapped/unmapped [R] : %5.2f%%\n", 100.0f * analyzer->mapped.avg_R_not_L());
378  log_verbose(stderr, " different ref : %5.2f%%\n", 100.0f * analyzer->different_ref());
379  log_verbose(stderr, " distant : %5.2f%%\n", 100.0f * analyzer->distant());
380  log_verbose(stderr, " discordant : %5.2f%%\n", 100.0f * analyzer->discordant());
381  log_verbose(stderr, " filtered : %u\n", analyzer->filtered());
382 
383  if (min_batch_size < BATCH_SIZE)
384  break;
385 
386  ++n_batch;
387  }
388  }
389  }
390  else if (argc == arg + 4)
391  {
392  const char *aln_file_nameL1 = argv[arg];
393  const char *aln_file_nameL2 = argv[arg+1];
394  const char *aln_file_nameR1 = argv[arg+2];
395  const char *aln_file_nameR2 = argv[arg+3];
396 
397  const char* aln_file_nameL = aln_file_nameL1;
398  const char* aln_file_nameR = aln_file_nameR1;
399 
404 
405  if (aln_streamL1 == NULL || aln_streamL1->is_ok() == false) { log_error(stderr, "failed opening \"%s\"\n", aln_file_nameL1); exit(1); }
406  if (aln_streamL2 == NULL || aln_streamL2->is_ok() == false) { log_error(stderr, "failed opening \"%s\"\n", aln_file_nameL2); exit(1); }
407 
408  if (aln_streamR2 == NULL || aln_streamR1->is_ok() == false) { log_error(stderr, "failed opening \"%s\"\n", aln_file_nameR1); exit(1); }
409  if (aln_streamR2 == NULL || aln_streamR2->is_ok() == false) { log_error(stderr, "failed opening \"%s\"\n", aln_file_nameR2); exit(1); }
410 
411  const uint32 BATCH_SIZE = 500000;
412  std::vector<Alignment> batchL1( BATCH_SIZE );
413  std::vector<Alignment> batchL2( BATCH_SIZE );
414  std::vector<Alignment> batchR1( BATCH_SIZE );
415  std::vector<Alignment> batchR2( BATCH_SIZE );
416 
417  Filter filter( filter_name, filter_flags, filter_stats, filter_delta );
418  SharedPointer<PEAnalyzer> analyzer = SharedPointer<PEAnalyzer>( new PEAnalyzer( filter, id_check ) );
419 
420  uint32 n_batch = 0;
421  while (1)
422  {
423  const uint32 batch_sizeL1 = aln_streamL1->next_batch( BATCH_SIZE, &batchL1[0] );
424  const uint32 batch_sizeL2 = aln_streamL2->next_batch( BATCH_SIZE, &batchL2[0] );
425  const uint32 batch_sizeR1 = aln_streamR1->next_batch( BATCH_SIZE, &batchR1[0] );
426  const uint32 batch_sizeR2 = aln_streamR2->next_batch( BATCH_SIZE, &batchR2[0] );
427 
428  const uint32 min_batch_size = nvbio::min(
429  nvbio::min( batch_sizeL1, batch_sizeL2 ),
430  nvbio::min( batch_sizeR1, batch_sizeR2 ) );
431 
432  const uint32 max_batch_size = nvbio::min(
433  nvbio::max( batch_sizeL1, batch_sizeL2 ),
434  nvbio::max( batch_sizeR1, batch_sizeR2 ) );
435 
436  if (min_batch_size != max_batch_size)
437  log_warning(stderr, "alignment files have different size\n");
438 
439  log_info(stderr, "analizing batch[%u]: %u alignments (%.1f M)\n", n_batch, min_batch_size, float(n_batch * BATCH_SIZE + min_batch_size)*1.0e-6f);
440 
441  for (uint32 i = 0; i < min_batch_size; ++i)
442  analyzer->push(
443  AlignmentPair( batchL1[i], batchL2[i] ),
444  AlignmentPair( batchR1[i], batchR2[i] ));
445 
446  if (report_name)
447  analyzer->generate_report( aln_file_nameL, aln_file_nameR, report_name );
448 
449  analyzer->flush();
450 
451  log_verbose(stderr, " mismatched : %5.2f%%\n", 100.0f * analyzer->mismatched());
452  log_verbose(stderr, " mapped [L] : %5.2f%%\n", 100.0f * analyzer->mapped.avg_L());
453  log_verbose(stderr, " mapped [R] : %5.2f%%\n", 100.0f * analyzer->mapped.avg_R());
454  log_verbose(stderr, " mapped [L&R] : %5.2f%%\n", 100.0f * analyzer->mapped.avg_L_and_R());
455  log_verbose(stderr, " mapped/unmapped [L] : %5.2f%%\n", 100.0f * analyzer->mapped.avg_L_not_R());
456  log_verbose(stderr, " mapped/unmapped [R] : %5.2f%%\n", 100.0f * analyzer->mapped.avg_R_not_L());
457  log_verbose(stderr, " paired [L] : %5.2f%%\n", 100.0f * analyzer->paired.avg_L());
458  log_verbose(stderr, " paired [R] : %5.2f%%\n", 100.0f * analyzer->paired.avg_R());
459  log_verbose(stderr, " paired [L&R] : %5.2f%%\n", 100.0f * analyzer->paired.avg_L_and_R());
460  log_verbose(stderr, " paired/unpaired [L] : %5.2f%%\n", 100.0f * analyzer->paired.avg_L_not_R());
461  log_verbose(stderr, " paired/unpaired [R] : %5.2f%%\n", 100.0f * analyzer->paired.avg_R_not_L());
462  log_verbose(stderr, " different ref : %5.2f%%\n", 100.0f * analyzer->different_ref());
463  log_verbose(stderr, " distant : %5.2f%%\n", 100.0f * analyzer->distant());
464  log_verbose(stderr, " discordant : %5.2f%%\n", 100.0f * analyzer->discordant());
465  log_verbose(stderr, " filtered : %u\n", analyzer->filtered());
466 
467  if (min_batch_size < BATCH_SIZE)
468  break;
469 
470  ++n_batch;
471  }
472  }
473 
474  cudaDeviceReset();
475  return 0;
476 }
477