NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
sam.cpp
Go to the documentation of this file.
1 /*
2  * nvbio
3  * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #include <stdlib.h>
29 #include <string.h>
30 #include <zlib/zlib.h>
31 
32 #include <nvbio/basic/console.h>
33 #include <nvbio/io/reads/sam.h>
34 
35 namespace nvbio {
36 namespace io {
37 
38 ReadDataFile_SAM::ReadDataFile_SAM(const char *read_file_name,
39  const uint32 max_reads,
40  const uint32 truncate_read_len,
41  const ReadEncoding flags)
42  : ReadDataFile(max_reads, truncate_read_len, flags)
43 {
44  fp = gzopen(read_file_name, "rt");
45  if (fp == Z_NULL)
46  {
47  // this will cause init() to fail below
48  log_error(stderr, "unable to open SAM file %s\n", read_file_name);
50  } else {
52  }
53 
54  linebuf = NULL;
55  linebuf_size = 0;
56  line_length = 0;
57 
58  numLines = 0;
59 
60  version = NULL;
61  sortOrder = SortOrder_unknown;
62 }
63 
64 bool ReadDataFile_SAM::readLine(void)
65 {
66  char *ret;
67  int start_file_pos;
68 
69  // current position in linebuf that we're writing into
70  // used when resizing the buffer
71  int cur_buf_pos = 0;
72 
73  if (!linebuf)
74  {
75  // allocate initial buffer
76  linebuf_size = LINE_BUFFER_INIT_SIZE;
77  linebuf = (char *) malloc(linebuf_size);
78 
79  if (linebuf == NULL)
80  {
81  log_error(stderr, "out of memory reading SAM file\n");
83  return false;
84  }
85  }
86 
87  // stash the (uncompressed) stream position we started reading from
88  start_file_pos = gztell(fp);
89 
90  for(;;)
91  {
92  // mark the next-to-last byte with a \0
93  // this serves to detect when a buffer is too small for a full line
94  linebuf[linebuf_size - 2] = '\0';
95 
96  // try to read a full line
97  ret = gzgets(fp, &linebuf[cur_buf_pos], linebuf_size - cur_buf_pos);
98  if (ret == Z_NULL)
99  {
100  // EOF
102  return false;
103  }
104 
105  if (linebuf[linebuf_size - 2] == '\0')
106  {
107  break;
108  } else {
109  // buffer is too small, double it
110  char *tmp;
111 
112  cur_buf_pos = linebuf_size - 1;
113 
114  linebuf_size *= 2;
115  tmp = (char *) realloc(linebuf, linebuf_size);
116  if (tmp == NULL)
117  {
118  log_error(stderr, "out of memory reading SAM file\n");
120  return false;
121  }
122 
123  linebuf = tmp;
124  }
125  }
126 
127  // store the byte size of the line we read
128  line_length = gztell(fp) - start_file_pos;
129 
130  // chop off the newline character if any
131  if (linebuf[line_length - 1] == '\n')
132  {
133  assert(linebuf[line_length] == '\0');
134  linebuf[line_length - 1] = '\0';
135  }
136 
137  numLines++;
138  return true;
139 }
140 
141 void ReadDataFile_SAM::rewindLine(void)
142 {
143  assert(line_length);
144  gzseek(fp, -line_length, SEEK_CUR);
145 }
146 
147 // initializes a SAM file
148 // will consume the SAM header and prepare for fetching reads from the file
149 // returns false on failure
151 {
152  bool ret;
153 
154  if (m_file_state != FILE_OK)
155  {
156  // file failed to open
157  return false;
158  }
159 
160  // read the header section
161  do {
162  ret = readLine();
163  if (!ret)
164  {
165  return false;
166  }
167 
168  if (linebuf[0] != '@')
169  {
170  break;
171  }
172 
173  char *delim;
174  delim = strchr(linebuf, '\t');
175 
176  if (delim)
177  {
178  if (strncmp(linebuf, "@HD\t", strlen("@HD\t")) == 0)
179  {
180  ret = parseHeaderLine(delim + 1);
181  if (!ret)
182  {
183  return false;
184  }
185  } else if (strncmp(linebuf, "@SQ\t", strlen("@SQ\t")) == 0)
186  {
187  ret = parseReferenceSequenceLine(delim + 1);
188  if (!ret)
189  {
190  return false;
191  }
192  } else if (strncmp(linebuf, "@RG\t", strlen("@RG\t")) == 0) {
193  // ignored
194  continue;
195  } else if (strncmp(linebuf, "@PG\t", strlen("@PG\t")) == 0) {
196  // ignored
197  continue;
198  } else if (strncmp(linebuf, "@CO\t", strlen("@CO\t")) == 0) {
199  // ignored
200  continue;
201  } else {
202  log_warning(stderr, "SAM file warning: unknown header at line %d\n", numLines);
203  }
204  } else {
205  log_warning(stderr, "SAM file warning: malformed line %d\n", numLines);
206  }
207  } while(linebuf[0] == '@');
208 
209  // rewind the last line
210  rewindLine();
211 
212  return true;
213 }
214 
215 // parse a @HD line from the SAM file
216 // start points at the first tag of the line
217 bool ReadDataFile_SAM::parseHeaderLine(char *start)
218 {
219  char *version = NULL;
220  char *delim;
221 
222  if (numLines != 1)
223  {
224  log_warning(stderr, "SAM file warning (line %d): @HD not the first line in the header section\n", numLines);
225  }
226 
227  for(;;)
228  {
229  // look for the next delimiter
230  delim = strchr(start, '\t');
231 
232  // zero out the next delimiter if found
233  if (delim)
234  {
235  *delim = 0;
236  }
237 
238  if (strncmp(start, "VN:", strlen("VN:")) == 0)
239  {
240  version = &start[3];
241  } else if (strncmp(start, "SO:", strlen("SO:")) == 0) {
242  if(strcmp(&start[3], "unknown") == 0)
243  {
244  sortOrder = SortOrder_unknown;
245  } else if (strcmp(&start[3], "unsorted") == 0) {
246  sortOrder = SortOrder_unsorted;
247  } else if (strcmp(&start[3], "queryname") == 0) {
248  sortOrder = SortOrder_queryname;
249  } else if (strcmp(&start[3], "coordinate") == 0) {
250  sortOrder = SortOrder_coordinate;
251  } else {
252  log_warning(stderr, "SAM file warning (line %d): invalid sort order %s\n", numLines, &start[3]);
253  }
254  } else {
255  log_warning(stderr, "SAM file warning (line %d): invalid tag %s in @HD\n", numLines, start);
256  }
257 
258  if (!delim)
259  {
260  // this was the last token
261  break;
262  }
263 
264  // advance to next token
265  start = delim + 1;
266  }
267 
268  if (version == NULL)
269  {
270  log_warning(stderr, "SAM file warning (line %d): header does not contain a version tag\n", numLines);
271  }
272 
273  return true;
274 }
275 
276 // parse a @SQ line from the SAM file
277 // start points at the first tag of the line
278 bool ReadDataFile_SAM::parseReferenceSequenceLine(char *start)
279 {
280  char *seq_name = NULL;
281  char *seq_len = NULL;
282  char *delim;
283 
284  for(;;)
285  {
286  // look for the next delimiter
287  delim = strchr(start, '\t');
288 
289  // zero out the next delimiter if found
290  if (delim)
291  {
292  *delim = 0;
293  }
294 
295  if (strncmp(start, "SN:", strlen("SN:")) == 0)
296  {
297  if (seq_name != NULL)
298  {
299  log_warning(stderr, "SAM file warning (line %d): multiple SN tags in @SQ record\n", numLines);
300  } else {
301  seq_name = &start[3];
302  }
303  } else if (strncmp(start, "LN:", strlen("LN:")) == 0) {
304  if (seq_len != NULL)
305  {
306  log_warning(stderr, "SAM file warning (line %d): multiple LN tags in @SQ record\n", numLines);
307  } else {
308  seq_len = &start[3];
309  }
310  }
311 
312  if (!delim)
313  {
314  // this was the last token
315  break;
316  }
317 
318  // advance to next token
319  start = delim + 1;
320  }
321 
322  if (seq_name == NULL || seq_len == NULL)
323  {
324  log_warning(stderr, "SAM file warning (line %d): missing required tags in @SQ record\n", numLines);
325  return true;
326  }
327 
328  char *endptr = NULL;
329 #if WIN32
330  uint64 len = strtol(seq_len, &endptr, 10);
331 #else
332  uint64 len = strtoll(seq_len, &endptr, 10);
333 #endif
334  if (!endptr || endptr == seq_len || *endptr != '\0')
335  {
336  log_warning(stderr, "SAM file warning (line %d): invalid sequence length in @SQ record\n", numLines);
337  }
338 
339  sq_names.push_back(std::string(seq_name));
340  sq_lengths.push_back(len);
341 
342  return true;
343 }
344 
345 
346 // fetch the next chunk of reads (up to max_reads) from the file and push it into output
347 int ReadDataFile_SAM::nextChunk(ReadDataRAM *output, uint32 max_reads, uint32 max_bps)
348 {
349  if (max_bps < ReadDataFile::LONG_READ)
350  return 0;
351 
352  char *name;
353  char *flag;
354  char *rname;
355  char *pos;
356  char *mapq;
357  char *cigar;
358  char *rnext;
359  char *pnext;
360  char *tlen;
361  char *seq;
362  char *qual;
363 
364  uint32 read_flags;
365 
366  // find the next primary alignment from the file
367  do {
368  // get next line from file
369  if (readLine() == false)
370  {
371  return 0;
372  }
373 
374 // ugly macro to tokenize the string based on strchr
375 #define NEXT(prev, next) \
376  { \
377  next = strchr(prev, '\t'); \
378  if (!next) { \
379  log_error(stderr, "Error parsing SAM file (line %d): incomplete alignment section\n", numLines); \
380  m_file_state = FILE_PARSE_ERROR; \
381  return -1; \
382  } \
383  *next = '\0'; \
384  next++; \
385  }
386 
387  // first token is just the start of the string
388  name = linebuf;
389 
390  // for all remaining tokens, locate the next token based on the previous
391  NEXT(name, flag);
392  NEXT(flag, rname);
393  NEXT(rname, pos);
394  NEXT(pos, mapq);
395  NEXT(mapq, cigar);
396  NEXT(cigar, rnext);
397  NEXT(rnext, pnext);
398  NEXT(pnext, tlen);
399  NEXT(tlen, seq);
400  NEXT(seq, qual);
401 
402 #undef NEXT
403 
404  // figure out what the flag value is
405  read_flags = strtol(flag, NULL, 0);
406  } while(read_flags & SAMFlag_SecondaryAlignment);
407 
408  if (m_flags & FORWARD)
409  {
410  const ReadDataRAM::StrandOp op = (read_flags & SAMFlag_ReverseComplemented) ?
412 
413  // add the read
414  output->push_back(uint32(strlen(seq)),
415  name,
416  (uint8*)seq,
417  (uint8*)qual,
418  Phred33,
420  op );
421  }
422  if (m_flags & REVERSE)
423  {
424  const ReadDataRAM::StrandOp op = (read_flags & SAMFlag_ReverseComplemented) ?
426 
427  // add the read
428  output->push_back(uint32(strlen(seq)),
429  name,
430  (uint8*)seq,
431  (uint8*)qual,
432  Phred33,
434  op );
435  }
437  {
438  const ReadDataRAM::StrandOp op = (read_flags & SAMFlag_ReverseComplemented) ?
440 
441  // add the read
442  output->push_back(uint32(strlen(seq)),
443  name,
444  (uint8*)seq,
445  (uint8*)qual,
446  Phred33,
448  op );
449  }
451  {
452  const ReadDataRAM::StrandOp op = (read_flags & SAMFlag_ReverseComplemented) ?
454 
455  // add the read
456  output->push_back(uint32(strlen(seq)),
457  name,
458  (uint8*)seq,
459  (uint8*)qual,
460  Phred33,
462  op );
463  }
464 
465  // we always input 1 read at a time here
466  return 1;
467 }
468 
469 } // namespace io
470 } // namespace nvbio