NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
sequence_fastq.cpp
Go to the documentation of this file.
1 /*
2  * nvbio
3  * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
30 #include <nvbio/basic/types.h>
31 #include <nvbio/basic/timer.h>
32 
33 #include <string.h>
34 #include <ctype.h>
35 
36 namespace nvbio {
37 namespace io {
38 
41 
44 
47 
49 {
50  uint32 n_reads = 0;
51  uint32 n_bps = 0;
52  char marker;
53 
54  const uint32 read_mult =
55  ((m_options.flags & FORWARD) ? 1u : 0u) +
56  ((m_options.flags & REVERSE) ? 1u : 0u) +
57  ((m_options.flags & FORWARD_COMPLEMENT) ? 1u : 0u) +
58  ((m_options.flags & REVERSE_COMPLEMENT) ? 1u : 0u);
59 
60  while (n_reads + read_mult <= max_reads &&
61  n_bps + read_mult*SequenceDataFile::LONG_READ <= max_bps)
62  {
63  #if defined(NVBIO_WEAK_FASTQ_SUPPORT)
64  //
65  // This parser works only with properly formatted, modern FASTQ files which
66  // don't split reads across multiple lines and don't contain extraneous
67  // characters...
68  //
69 
70  const int MAX_LINE = 32*1024;
71  char line_buffer[MAX_LINE];
72 
73  // read the name line
74  if (gets( line_buffer, MAX_LINE ) == false)
75  {
77  break;
78  }
79  m_line++;
80 
81  strcpy( (char*)&m_name[0], &line_buffer[1] );
82 
83  // read the sequence line
84  if (gets( (char*)&m_read_bp[0], (int)m_read_bp.size() ) == false)
85  {
86  log_error(stderr, "FASTQ loader: incomplete read at line %u!\n", m_line);
88  return -1;
89  }
90  m_line++;
91 
92  const uint32 len = strlen( (const char*)&m_read_bp[0] );
93 
94  // read the "+" line
95  if (gets( line_buffer, MAX_LINE ) == false)
96  {
97  log_error(stderr, "FASTQ loader: incomplete read at line %u!\n", m_line);
99  return -1;
100  }
101  m_line++;
102 
103  // read the qualities line
104  if (gets( (char*)&m_read_q[0], (int)m_read_q.size() ) == false)
105  {
106  log_error(stderr, "FASTQ loader: incomplete read at line %u!\n", m_line);
108  return -1;
109  }
110  m_line++;
111  #else
112  // consume spaces & newlines
113  do {
114  marker = get();
115 
116  // count lines
117  if (marker == '\n')
118  m_line++;
119  }
120  while (marker >= 1 && marker <= 31);
121 
122  // check for EOF or read errors
123  if (m_file_state != FILE_OK)
124  break;
125 
126  // if the newlines didn't end in a read marker,
127  // issue a parsing error...
128  if (marker != '@')
129  {
130  log_error(stderr, "FASTQ loader: parsing error at %u!\n", m_line);
131 
133  m_error_char = marker;
134  return uint32(-1);
135  }
136 
137  // read all the line
138  uint32 len = 0;
139  for (uint8 c = get(); c != '\n' && c != 0; c = get())
140  {
141  m_name[ len++ ] = c;
142 
143  // expand on demand
144  if (m_name.size() <= len)
145  m_name.resize( len * 2u );
146  }
147 
148  m_name[ len++ ] = '\0';
149 
150  // check for errors
151  if (m_file_state != FILE_OK)
152  {
153  log_error(stderr, "FASTQ loader: incomplete read at line %u!\n", m_line);
154 
155  m_error_char = 0;
156  return uint32(-1);
157  }
158 
159  m_line++;
160 
161  // start reading the bp read
162  len = 0;
163  for (char c = get(); c != '+' && c != 0; c = get())
164  {
165  // if (isgraph(c))
166  if (c >= 0x21 && c <= 0x7E)
167  m_read_bp[ len++ ] = c;
168  else if (c == '\n')
169  m_line++;
170 
171  // expand on demand
172  if (m_read_bp.size() <= len)
173  {
174  m_read_bp.resize( len * 2u );
175  m_read_q.resize( len * 2u );
176  }
177  }
178 
179  const uint32 read_len = len;
180 
181  // check for errors
182  if (m_file_state != FILE_OK)
183  {
184  log_error(stderr, "FASTQ loader: incomplete read at line %u!\n", m_line);
185 
186  m_error_char = 0;
187  return uint32(-1);
188  }
189 
190  // read all the line
191  for (char c = get(); c != '\n' && c != 0; c = get()) {}
192 
193  // check for errors
194  if (m_file_state != FILE_OK)
195  {
196  log_error(stderr, "FASTQ loader: incomplete read at line %u!\n", m_line);
197 
198  m_error_char = 0;
199  return uint32(-1);
200  }
201 
202  m_line++;
203 
204  // start reading the quality read
205  len = 0;
206 
207  // read as many qualities as there are in the read
208  for (char c = get(); (len < read_len) && (m_file_state == FILE_OK); c = get())
209  {
210  // if (isgraph(c))
211  if (c >= 0x21 && c <= 0x7E)
212  m_read_q[ len++ ] = c;
213  else if (c == '\n')
214  m_line++;
215  }
216  /*
217  // the below works for proper FASTQ files, but not for old Sanger ones
218  // allowing strings to span multiple lines...
219  for (uint8 c = get(); c != '\n' && c != 0; c = get())
220  m_read_q[ len++ ] = c;
221 
222  if (len < read_len)
223  {
224  m_read_bp[ read_len ] = '\0';
225  m_read_q[ len ] = '\0';
226 
227  log_error(stderr, "FASTQ loader: qualities and read lengths differ at line %u!\n", m_line);
228  log_error(stderr, " name: %s\n", &m_name[0]);
229  log_error(stderr, " read: %s\n", &m_read_bp[0]);
230  log_error(stderr, " qual: %s\n", &m_read_q[0]);
231 
232  m_file_state = FILE_PARSE_ERROR;
233 
234  m_error_char = 0;
235  return uint32(-1);
236  }*/
237 
238  // check for errors
239  if (m_file_state != FILE_OK)
240  {
241  log_error(stderr, "FASTQ loader: incomplete read at line %u!\n", m_line);
242 
243  m_error_char = 0;
244  return uint32(-1);
245  }
246 
247  m_line++;
248  #endif
249 
250  if (m_options.flags & FORWARD)
251  {
252  output->push_back( len,
253  &m_name[0],
254  &m_read_bp[0],
255  &m_read_q[0],
261  }
262  if (m_options.flags & REVERSE)
263  {
264  output->push_back( len,
265  &m_name[0],
266  &m_read_bp[0],
267  &m_read_q[0],
273  }
275  {
276  output->push_back( len,
277  &m_name[0],
278  &m_read_bp[0],
279  &m_read_q[0],
285  }
287  {
288  output->push_back( len,
289  &m_name[0],
290  &m_read_bp[0],
291  &m_read_q[0],
297  }
298 
299  n_bps += read_mult * len;
300  n_reads += read_mult;
301  }
302  return n_reads;
303 }
304 
306  const char* read_file_name,
307  const SequenceDataFile::Options& options)
308  : SequenceDataFile_FASTQ_parser(read_file_name, options)
309 {
310  m_file = gzopen(read_file_name, "r");
311  if (!m_file) {
313  } else {
315  }
316 
317  gzbuffer(m_file, m_buffer_size);
318 }
319 
321 {
322  gzclose( m_file );
323 }
324 
325 //static float time = 0.0f;
326 
328 {
329  m_buffer_size = gzread(m_file, &m_buffer[0], (uint32)m_buffer.size());
330 
331  if (m_buffer_size <= 0)
332  {
333  // check for EOF separately; zlib will not always return Z_STREAM_END at EOF below
334  if (gzeof(m_file))
335  return FILE_EOF;
336  else
337  {
338  // ask zlib what happened and inform the user
339  int err;
340  const char *msg;
341  log_warning(stderr, "zlib error\n");
342 
343  msg = gzerror(m_file, &err);
344  // we're making the assumption that we never see Z_STREAM_END here
345  assert(err != Z_STREAM_END);
346 
347  log_error(stderr, "error processing FASTQ file: zlib error %d (%s)\n", err, msg);
348  return FILE_STREAM_ERROR;
349  }
350  }
351  return FILE_OK;
352 }
353 
354 // rewind
355 //
357 {
358  if (m_file == NULL || (m_file_state != FILE_OK && m_file_state != FILE_EOF))
359  return false;
360 
361  gzrewind( m_file );
362 
364 
365  m_buffer_size = (uint32)m_buffer.size();
366  m_buffer_pos = (uint32)m_buffer.size();
367  m_line = 0;
368  return true;
369 }
370 
371 // constructor
372 //
374  const char* file_name,
375  const char* compressor,
376  const char* options)
377  : m_file_name(file_name)
378 {
379  m_file = open_output_file( file_name, compressor, options );
380 }
381 
382 namespace {
383 
384 template <Alphabet ALPHABET>
385 void write(
386  OutputStream* output_file,
387  const io::SequenceDataAccess<ALPHABET>& sequence_data)
388 {
390  typedef typename io::SequenceDataAccess<ALPHABET>::qual_string qual_string;
391  typedef typename io::SequenceDataAccess<ALPHABET>::name_string name_string;
392 
393  const uint32 buffer_len =
394  sequence_data.size() * 5 + // the number of delimiters consumed for each read (i.e. '@', '\n', '+')
395  sequence_data.bps() * 2 + // the number of characters consumed by reads and qualities
396  sequence_data.name_stream_len(); // the number of characters consumed by names
397 
398  std::vector<char> buffer( buffer_len );
399 
400  #pragma omp parallel for
401  for (int32 i = 0; i < (int32)sequence_data.size(); ++i)
402  {
403  const sequence_string read = sequence_data.get_read( i );
404  const qual_string qual = sequence_data.get_quals( i );
405  const name_string name = sequence_data.get_name( i );
406 
407  uint32 buffer_offset = i*5 + sequence_data.get_range( i ).x*2 + sequence_data.name_index()[ i ];
408 
409  buffer[ buffer_offset++ ] = '@';
410 
411  for (uint32 j = 0; j < name.size(); ++j)
412  buffer[ buffer_offset++ ] = name[j];
413 
414  // setup the ASCII read
415  buffer[ buffer_offset++ ] = '\n';
416 
417  to_string<ALPHABET>( read.begin(), read.size(), &buffer[0] + buffer_offset );
418 
419  buffer_offset += read.size();
420 
421  buffer[ buffer_offset++ ] = '\n';
422  buffer[ buffer_offset++ ] = '+';
423  buffer[ buffer_offset++ ] = '\n';
424 
425  // copy the qualities to the output buffer
426  for (uint32 j = 0; j < read.size(); ++j)
427  buffer[ buffer_offset++ ] = char( qual[j] );
428 
429  buffer[ buffer_offset++ ] = '\n';
430  }
431 
432  if (output_file->write( buffer_len, &buffer[0] ) == 0)
433  throw runtime_error( "failed writing FASTQ output file" );
434 }
435 
436 } // anonymous namespace
437 
438 // next batch
439 //
441 {
442  if (sequence_data.alphabet() == DNA)
443  write( m_file, io::SequenceDataAccess<DNA>( sequence_data ) );
444  else if (sequence_data.alphabet() == DNA_N)
445  write( m_file, io::SequenceDataAccess<DNA_N>( sequence_data ) );
446  else if (sequence_data.alphabet() == PROTEIN)
447  write( m_file, io::SequenceDataAccess<PROTEIN>( sequence_data ) );
448  else if (sequence_data.alphabet() == RNA)
449  write( m_file, io::SequenceDataAccess<RNA>( sequence_data ) );
450  else if (sequence_data.alphabet() == RNA_N)
451  write( m_file, io::SequenceDataAccess<RNA_N>( sequence_data ) );
452  else if (sequence_data.alphabet() == ASCII)
453  write( m_file, io::SequenceDataAccess<ASCII>( sequence_data ) );
454 }
455 
456 // return whether the stream is ok
457 //
458 bool SequenceDataOutputFile_FASTQ::is_ok() { return m_file && m_file->is_valid(); }
459 
460 
464 
465 } // namespace io
466 } // namespace nvbio