NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
sequence_sam.cpp
Go to the documentation of this file.
1 /*
2  * nvbio
3  * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #include <stdlib.h>
29 #include <string.h>
30 #include <zlib/zlib.h>
31 
32 #include <nvbio/basic/console.h>
35 
36 namespace nvbio {
37 namespace io {
38 
40  const char* read_file_name,
41  const SequenceDataFile::Options& options)
42  : SequenceDataFile( options )
43 {
44  fp = gzopen(read_file_name, "rt");
45  if (fp == Z_NULL)
46  {
47  // this will cause init() to fail below
48  log_error(stderr, "unable to open SAM file %s\n", read_file_name);
50  } else {
52  }
53 
54  linebuf = NULL;
55  linebuf_size = 0;
56  line_length = 0;
57  numLines = 0;
58  version = NULL;
59  sortOrder = SortOrder_unknown;
60 }
61 
62 bool SequenceDataFile_SAM::readLine(void)
63 {
64  char *ret;
65  int start_file_pos;
66 
67  // current position in linebuf that we're writing into
68  // used when resizing the buffer
69  int cur_buf_pos = 0;
70 
71  if (!linebuf)
72  {
73  // allocate initial buffer
74  linebuf_size = LINE_BUFFER_INIT_SIZE;
75  linebuf = (char *) malloc(linebuf_size);
76 
77  if (linebuf == NULL)
78  {
79  log_error(stderr, "out of memory reading SAM file\n");
81  return false;
82  }
83  }
84 
85  // stash the (uncompressed) stream position we started reading from
86  start_file_pos = gztell(fp);
87 
88  for(;;)
89  {
90  // mark the next-to-last byte with a \0
91  // this serves to detect when a buffer is too small for a full line
92  linebuf[linebuf_size - 2] = '\0';
93 
94  // try to read a full line
95  ret = gzgets(fp, &linebuf[cur_buf_pos], linebuf_size - cur_buf_pos);
96  if (ret == Z_NULL)
97  {
98  // EOF
100  return false;
101  }
102 
103  if (linebuf[linebuf_size - 2] == '\0')
104  {
105  break;
106  } else {
107  // buffer is too small, double it
108  char *tmp;
109 
110  cur_buf_pos = linebuf_size - 1;
111 
112  linebuf_size *= 2;
113  tmp = (char *) realloc(linebuf, linebuf_size);
114  if (tmp == NULL)
115  {
116  log_error(stderr, "out of memory reading SAM file\n");
118  return false;
119  }
120 
121  linebuf = tmp;
122  }
123  }
124 
125  // store the byte size of the line we read
126  line_length = gztell(fp) - start_file_pos;
127 
128  // chop off the newline character if any
129  if (linebuf[line_length - 1] == '\n')
130  {
131  assert(linebuf[line_length] == '\0');
132  linebuf[line_length - 1] = '\0';
133  }
134 
135  numLines++;
136  return true;
137 }
138 
139 void SequenceDataFile_SAM::rewindLine(void)
140 {
141  assert(line_length);
142  gzseek(fp, -line_length, SEEK_CUR);
143 }
144 
145 // initializes a SAM file
146 // will consume the SAM header and prepare for fetching reads from the file
147 // returns false on failure
149 {
150  bool ret;
151 
152  if (m_file_state != FILE_OK)
153  {
154  // file failed to open
155  return false;
156  }
157 
158  // read the header section
159  do {
160  ret = readLine();
161  if (!ret)
162  {
163  return false;
164  }
165 
166  if (linebuf[0] != '@')
167  {
168  break;
169  }
170 
171  char *delim;
172  delim = strchr(linebuf, '\t');
173 
174  if (delim)
175  {
176  if (strncmp(linebuf, "@HD\t", strlen("@HD\t")) == 0)
177  {
178  ret = parseHeaderLine(delim + 1);
179  if (!ret)
180  {
181  return false;
182  }
183  } else if (strncmp(linebuf, "@SQ\t", strlen("@SQ\t")) == 0)
184  {
185  ret = parseReferenceSequenceLine(delim + 1);
186  if (!ret)
187  {
188  return false;
189  }
190  } else if (strncmp(linebuf, "@RG\t", strlen("@RG\t")) == 0) {
191  // ignored
192  continue;
193  } else if (strncmp(linebuf, "@PG\t", strlen("@PG\t")) == 0) {
194  // ignored
195  continue;
196  } else if (strncmp(linebuf, "@CO\t", strlen("@CO\t")) == 0) {
197  // ignored
198  continue;
199  } else {
200  log_warning(stderr, "SAM file warning: unknown header at line %d\n", numLines);
201  }
202  } else {
203  log_warning(stderr, "SAM file warning: malformed line %d\n", numLines);
204  }
205  } while(linebuf[0] == '@');
206 
207  // rewind the last line
208  rewindLine();
209 
210  return true;
211 }
212 
213 // parse a @HD line from the SAM file
214 // start points at the first tag of the line
215 bool SequenceDataFile_SAM::parseHeaderLine(char *start)
216 {
217  char *version = NULL;
218  char *delim;
219 
220  if (numLines != 1)
221  {
222  log_warning(stderr, "SAM file warning (line %d): @HD not the first line in the header section\n", numLines);
223  }
224 
225  for(;;)
226  {
227  // look for the next delimiter
228  delim = strchr(start, '\t');
229 
230  // zero out the next delimiter if found
231  if (delim)
232  {
233  *delim = 0;
234  }
235 
236  if (strncmp(start, "VN:", strlen("VN:")) == 0)
237  {
238  version = &start[3];
239  } else if (strncmp(start, "SO:", strlen("SO:")) == 0) {
240  if(strcmp(&start[3], "unknown") == 0)
241  {
242  sortOrder = SortOrder_unknown;
243  } else if (strcmp(&start[3], "unsorted") == 0) {
244  sortOrder = SortOrder_unsorted;
245  } else if (strcmp(&start[3], "queryname") == 0) {
246  sortOrder = SortOrder_queryname;
247  } else if (strcmp(&start[3], "coordinate") == 0) {
248  sortOrder = SortOrder_coordinate;
249  } else {
250  log_warning(stderr, "SAM file warning (line %d): invalid sort order %s\n", numLines, &start[3]);
251  }
252  } else {
253  log_warning(stderr, "SAM file warning (line %d): invalid tag %s in @HD\n", numLines, start);
254  }
255 
256  if (!delim)
257  {
258  // this was the last token
259  break;
260  }
261 
262  // advance to next token
263  start = delim + 1;
264  }
265 
266  if (version == NULL)
267  {
268  log_warning(stderr, "SAM file warning (line %d): header does not contain a version tag\n", numLines);
269  }
270 
271  return true;
272 }
273 
274 // parse a @SQ line from the SAM file
275 // start points at the first tag of the line
276 bool SequenceDataFile_SAM::parseReferenceSequenceLine(char *start)
277 {
278  char *seq_name = NULL;
279  char *seq_len = NULL;
280  char *delim;
281 
282  for(;;)
283  {
284  // look for the next delimiter
285  delim = strchr(start, '\t');
286 
287  // zero out the next delimiter if found
288  if (delim)
289  {
290  *delim = 0;
291  }
292 
293  if (strncmp(start, "SN:", strlen("SN:")) == 0)
294  {
295  if (seq_name != NULL)
296  {
297  log_warning(stderr, "SAM file warning (line %d): multiple SN tags in @SQ record\n", numLines);
298  } else {
299  seq_name = &start[3];
300  }
301  } else if (strncmp(start, "LN:", strlen("LN:")) == 0) {
302  if (seq_len != NULL)
303  {
304  log_warning(stderr, "SAM file warning (line %d): multiple LN tags in @SQ record\n", numLines);
305  } else {
306  seq_len = &start[3];
307  }
308  }
309 
310  if (!delim)
311  {
312  // this was the last token
313  break;
314  }
315 
316  // advance to next token
317  start = delim + 1;
318  }
319 
320  if (seq_name == NULL || seq_len == NULL)
321  {
322  log_warning(stderr, "SAM file warning (line %d): missing required tags in @SQ record\n", numLines);
323  return true;
324  }
325 
326  char *endptr = NULL;
327 #if WIN32
328  uint64 len = strtol(seq_len, &endptr, 10);
329 #else
330  uint64 len = strtoll(seq_len, &endptr, 10);
331 #endif
332  if (!endptr || endptr == seq_len || *endptr != '\0')
333  {
334  log_warning(stderr, "SAM file warning (line %d): invalid sequence length in @SQ record\n", numLines);
335  }
336 
337  sq_names.push_back(std::string(seq_name));
338  sq_lengths.push_back(len);
339 
340  return true;
341 }
342 
343 // rewind
344 //
346 {
347  if (fp == NULL)
348  return false;
349 
350  gzrewind( fp );
351 
352  linebuf = NULL;
353  linebuf_size = 0;
354  line_length = 0;
355  numLines = 0;
356  version = NULL;
357  sortOrder = SortOrder_unknown;
358 
360  return init();
361 }
362 
363 // fetch the next chunk of reads (up to max_reads) from the file and push it into output
365 {
366  if (max_bps < SequenceDataFile::LONG_READ)
367  return 0;
368 
369  char *name;
370  char *flag;
371  char *rname;
372  char *pos;
373  char *mapq;
374  char *cigar;
375  char *rnext;
376  char *pnext;
377  char *tlen;
378  char *seq;
379  char *qual;
380 
381  uint32 read_flags;
382 
383  // find the next primary alignment from the file
384  do {
385  // get next line from file
386  if (readLine() == false)
387  {
388  return 0;
389  }
390 
391 // ugly macro to tokenize the string based on strchr
392 #define NEXT(prev, next) \
393  { \
394  next = strchr(prev, '\t'); \
395  if (!next) { \
396  log_error(stderr, "Error parsing SAM file (line %d): incomplete alignment section\n", numLines); \
397  m_file_state = FILE_PARSE_ERROR; \
398  return -1; \
399  } \
400  *next = '\0'; \
401  next++; \
402  }
403 
404  // first token is just the start of the string
405  name = linebuf;
406 
407  // for all remaining tokens, locate the next token based on the previous
408  NEXT(name, flag);
409  NEXT(flag, rname);
410  NEXT(rname, pos);
411  NEXT(pos, mapq);
412  NEXT(mapq, cigar);
413  NEXT(cigar, rnext);
414  NEXT(rnext, pnext);
415  NEXT(pnext, tlen);
416  NEXT(tlen, seq);
417  NEXT(seq, qual);
418 
419 #undef NEXT
420 
421  // figure out what the flag value is
422  read_flags = strtol(flag, NULL, 0);
423  } while(read_flags & SAMFlag_SecondaryAlignment);
424 
425  if (m_options.flags & FORWARD)
426  {
429 
430  // add the read
431  output->push_back(uint32(strlen(seq)),
432  name,
433  (uint8*)seq,
434  (uint8*)qual,
435  Phred33,
439  op );
440  }
441  if (m_options.flags & REVERSE)
442  {
445 
446  // add the read
447  output->push_back(uint32(strlen(seq)),
448  name,
449  (uint8*)seq,
450  (uint8*)qual,
451  Phred33,
455  op );
456  }
458  {
461 
462  // add the read
463  output->push_back(uint32(strlen(seq)),
464  name,
465  (uint8*)seq,
466  (uint8*)qual,
467  Phred33,
471  op );
472  }
474  {
477 
478  // add the read
479  output->push_back(uint32(strlen(seq)),
480  name,
481  (uint8*)seq,
482  (uint8*)qual,
483  Phred33,
487  op );
488  }
489 
490  // we always input 1 read at a time here
491  return 1;
492 }
493 
494 } // namespace io
495 } // namespace nvbio