NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
vcf.cpp
Go to the documentation of this file.
1 /*
2  * nvbio
3  * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 // loader for variant call format files, version 4.2
29 
30 #include <nvbio/basic/console.h>
31 #include <nvbio/io/vcf.h>
33 #include <nvbio/basic/dna.h>
34 
35 #include <stdlib.h>
36 #include <string.h>
37 
38 namespace nvbio {
39 namespace io {
40 
41 // parse the INFO field looking for an END tag
42 // INFO is a set of ID=val entries separated by semicolons
43 // returns false if a parse error occurs
44 static bool get_end_position(uint32 *out, char *info)
45 {
46  char *sc, *eq;
47 
48  do {
49  // search for the next semi-colon
50  sc = strchr(info, ';');
51  if (sc)
52  {
53  // null it out
54  *sc = '\0';
55  }
56 
57  // now search for the next equal sign
58  eq = strchr(info, '=');
59  if (!eq)
60  {
61  // no equal sign, malformed header
62  return false;
63  }
64 
65  // zero out the equal sign
66  *eq = 0;
67 
68  // check the key name
69  if (strcmp(info, "END") == 0)
70  {
71  // parse the END value
72  char *endptr = NULL;
73  uint32 position = strtoll(eq + 1, &endptr, 10);
74  if (!endptr || endptr == eq || *endptr != '\0')
75  {
76  return false;
77  }
78 
79  *out = position;
80  return true;
81  }
82 
83  if (sc)
84  {
85  info = sc + 1;
86  } else {
87  info = NULL;
88  }
89  } while (info && *info);
90 
91  return true;
92 }
93 
94 // loads a VCF 4.2 file, appending the data to output
95 bool loadVCF(SNPDatabase& output, const char *file_name)
96 {
97  BufferedTextFile file(file_name);
98  char *line, *end;
99  uint32 line_counter = 0;
100 
101  while((line = file.next_record(&end)))
102  {
103  line_counter++;
104  *end = '\0';
105 
106  // strip out comments
107  char *comment = strchr(line, '#');
108  if (comment)
109  *comment = '\0';
110 
111  // skip all leading whitespace
112  while (*line == ' ' || *line == '\t' || *line == '\r')
113  {
114  line++;
115  }
116 
117  if (*line == '\0')
118  {
119  // empty line, skip
120  continue;
121  }
122 
123  // parse the entries in each record
124  char *chrom = NULL;
125  char *pos = NULL;
126  char *id = NULL;
127  char *ref = NULL;
128  char *alt = NULL;
129  char *qual = NULL;
130  char *filter = NULL;
131  char *info = NULL;
132 
133 // ugly macro to tokenize the string based on strchr
134 #define NEXT(prev, next) \
135  { \
136  if (prev) \
137  { \
138  next = strchr(prev, '\t'); \
139  if (next) \
140  { \
141  *next = '\0'; \
142  next++; \
143  } \
144  } \
145  }
146 
147  chrom = line;
148  NEXT(chrom, pos);
149  NEXT(pos, id);
150  NEXT(id, ref);
151  NEXT(ref, alt);
152  NEXT(alt, qual);
153  NEXT(qual, filter);
154  NEXT(filter, info);
155 
156  if (!chrom || !pos || !id || !ref || !alt || !qual || !filter)
157  {
158  log_error(stderr, "Error parsing VCF file (line %d): incomplete variant\n", line_counter);
159  return false;
160  }
161 
162 #undef NEXT
163 
164  // convert position and quality
165  char *endptr = NULL;
166  uint32 position = strtoll(pos, &endptr, 10);
167  if (!endptr || endptr == pos || *endptr != '\0')
168  {
169  log_error(stderr, "VCF file error (line %d): invalid position\n", line_counter);
170  return false;
171  }
172 
173  uint8 quality;
174  if (*qual == '.')
175  {
176  quality = 0xff;
177  } else {
178  quality = (uint8) strtol(qual, &endptr, 10);
179  if (!endptr || endptr == qual || *endptr != '\0')
180  {
181  log_warning(stderr, "VCF file error (line %d): invalid quality\n", line_counter);
182  quality = 0xff;
183  }
184  }
185 
186  uint32 stop = position + strlen(ref);
187  // parse the info header looking for a stop position
188  if (info)
189  {
190  bool ret;
191  ret = get_end_position(&stop, info);
192  if (ret == false)
193  {
194  log_warning(stderr, "VCF file error (line %d): error parsing INFO line\n", line_counter);
195  return false;
196  }
197  }
198 
199  // add an entry for each possible variant listed in this record
200  do {
201  char *next_base = strchr(alt, ',');
202  if (next_base)
203  *next_base = '\0';
204 
205  char *var;
206  // if this is a called monomorphic variant (i.e., a site which has been identified as always having the same allele)
207  // we store the reference string as the variant
208  if (strcmp(alt, ".") == 0)
209  var = ref;
210  else
211  var = alt;
212 
213  const uint32 ref_len = strlen(ref);
214  const uint32 var_len = strlen(var);
215 
216  SNP_sequence_index index(output.reference_sequences.size(), ref_len,
217  output.variants.size(), var_len);
218  output.ref_variant_index.push_back(index);
219 
220  output.reference_sequence_names.push_back(std::string(chrom));
221  output.sequence_positions.push_back(make_uint2(position, stop));
222 
223  output.reference_sequences.resize(index.reference_start + ref_len);
224  string_to_iupac16(ref, output.reference_sequences.begin() + index.reference_start);
225 
226  output.variants.resize(index.variant_start + var_len);
227  string_to_iupac16(var, output.variants.begin() + index.variant_start);
228 
229  output.variant_qualities.push_back(quality);
230 
231  if (next_base)
232  alt = next_base + 1;
233  else
234  alt = NULL;
235  } while (alt && *alt != '\0');
236  }
237 
238  return true;
239 }
240 
241 } // namespace io
242 } // namespace nvbio