NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
fastq_inl.h
Go to the documentation of this file.
1 /*
2  * nvbio
3  * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #pragma once
29 
30 namespace nvbio {
31 
32 // get the next character, or 255 if EOF
33 //
// NOTE(review): the declarator that belongs between `inline` and `{` is not
// present in this listing, so the owning class cannot be named from here.
//
// Serves one element of m_buffer per call. Once the cursor (m_buffer_pos) has
// reached the fill level (m_buffer_size), the buffer is refilled through
// read() and the cursor is rewound to 0.
34 inline
36 {
37  if (m_buffer_pos >= m_buffer_size)
38  {
// buffer exhausted: whatever read() returns becomes the new fill level
// (presumably the number of bytes it stored -- confirm against read())
39  m_buffer_size = read( &m_buffer[0], (uint32)m_buffer.size() );
40  m_buffer_pos = 0;
41  }
// if the refill produced nothing the cursor is still not below the fill level,
// and FASTQ_EOF is reported instead of a buffered element
42  return (m_buffer_pos < m_buffer_size) ? m_buffer[ m_buffer_pos++ ] : FASTQ_EOF;
43 };
44 
45 // constructor
46 //
47 inline
48 FASTQ_file::FASTQ_file(const uint32 buffer_size)
49  : m_file( NULL ),
50  m_buffer( buffer_size ),
51  m_buffer_size( 0 ),
52  m_buffer_pos( 0 )
53 {}
54 
55 // constructor
56 //
57 inline
58 FASTQ_file::FASTQ_file(const char* filename, const uint32 buffer_size)
59  : m_buffer( buffer_size ), m_buffer_size( 0 ), m_buffer_pos( 0 )
60 {
61  m_file = fopen( filename, "r" );
62 }
63 
64 // open a new file
65 //
66 inline
67 void FASTQ_file::open(const char* filename)
68 {
69  if (m_file)
70  fclose( m_file );
71 
72  m_file = fopen( filename, "r" );
73 }
74 
75 // destructor
76 //
// NOTE(review): the declarator line is missing from this listing; going by the
// "destructor" comment and the use of m_file this is presumably
// FASTQ_file::~FASTQ_file() -- confirm against the header.
//
// Closes m_file if it is non-NULL.
77 inline
79 {
80  if (m_file)
81  fclose( m_file );
82 }
83 
84 // get the next character, or 255 if EOF
85 //
// NOTE(review): the declarator line is missing from this listing; since the
// body reads from m_file this is presumably FASTQ_file::get() -- confirm.
//
// Serves one element of m_buffer per call. Once the cursor (m_buffer_pos) has
// reached the fill level (m_buffer_size), the buffer is refilled from m_file
// with fread() and the cursor is rewound to 0.
//
// NOTE(review): m_file is passed to fread() without a NULL check; confirm that
// callers never get here after a failed fopen().
86 inline
88 {
89  if (m_buffer_pos >= m_buffer_size)
90  {
// buffer exhausted: fread() returns the number of elements actually read,
// which becomes the new fill level (0 at end of file or on a read error)
91  m_buffer_size = (uint32)fread( &m_buffer[0], sizeof(uint8), m_buffer.size(), m_file );
92  m_buffer_pos = 0;
93  }
// nothing could be read => report FASTQ_EOF instead of a buffered element
94  return (m_buffer_pos < m_buffer_size) ? m_buffer[ m_buffer_pos++ ] : FASTQ_EOF;
95 };
96 
97 // constructor
98 //
// NOTE(review): the declarator line (listing line 100) is missing here, so the
// parameter list is not visible; what follows is only the tail of the
// member-initializer list and the empty body.
//
// Stores the address of `stream` in m_stream and clears the error code and the
// line counter. Only the address is kept, so `stream` presumably has to
// outlive the reader -- confirm against the header.
99 template <typename FASTQ_stream>
101  m_stream( &stream ),
102  m_error(0),
103  m_line(0)
104 {}
105 
106 // read a batch of bp reads
107 //
108 template <typename FASTQ_stream>
109 template <typename Writer>
110 uint32 FASTQ_reader<FASTQ_stream>::read(const uint32 n_reads, Writer& writer)
111 {
112  uint32 n = 0;
113  uint8 marker;
114 
115  while (n < n_reads)
116  {
117  // consume spaces & newlines
118  for (marker = get();
119  marker == '\n' ||
120  marker == ' ';
121  marker = get())
122  {
123  if (marker == '\n')
124  m_line++;
125  }
126 
127  // check for EOF
128  if (marker == FASTQ_EOF)
129  break;
130 
131  // if the newlines didn't end in a read marker,
132  // issue a parsing error...
133  if (marker != '@')
134  {
135  m_error = 1;
136  m_error_char = marker;
137  return uint32(-1);
138  }
139 
140  // read all the line
141  m_name.erase( m_name.begin(), m_name.end() );
142  for (uint8 c = get(); c != '\n'; c = get())
143  {
144  if (c == FASTQ_EOF)
145  {
146  log_error(stderr, "incomplete read at EOF!\n");
147  m_name.erase(m_name.begin(), m_name.end());
148 
149  m_error = 1;
150  m_error_char = FASTQ_EOF;
151  return uint32(-1);
152  }
153 
154  m_name.push_back(c);
155  }
156 
157  m_name.push_back('\0');
158 
159  m_line++;
160 
161  // start reading the bp read
162  m_read_bp.erase( m_read_bp.begin(), m_read_bp.end() );
163  for (uint8 c = get(); c != '+'; c = get())
164  {
165  if (c == FASTQ_EOF)
166  {
167  log_error(stderr, "incomplete read at EOF!\n");
168  m_name.erase(m_name.begin(), m_name.end());
169 
170  m_error = 1;
171  m_error_char = FASTQ_EOF;
172  return uint32(-1);
173  }
174 
175  if (isgraph(c))
176  m_read_bp.push_back( c );
177 
178  if (c == '\n')
179  m_line++;
180  }
181 
182  // read all the line
183  for(uint8 c = get(); c != '\n'; c = get())
184  {
185  if (c == FASTQ_EOF)
186  {
187  log_error(stderr, "incomplete read at EOF!\n");
188  m_name.erase(m_name.begin(), m_name.end());
189  m_read_bp.erase(m_read_bp.begin(), m_read_bp.end());
190 
191  m_error = 1;
192  m_error_char = FASTQ_EOF;
193  return uint32(-1);
194  }
195 
196  }
197 
198  m_line++;
199 
200  // start reading the quality read
201  m_read_q.erase( m_read_q.begin(), m_read_q.end() );
202  for (uint8 c = get(); c != '\n'; c = get())
203  {
204  if (c == FASTQ_EOF)
205  {
206  log_error(stderr, "incomplete read at EOF!\n");
207  m_name.erase(m_name.begin(), m_name.end());
208  m_read_bp.erase(m_read_bp.begin(), m_read_bp.end());
209 
210  m_error = 1;
211  m_error_char = FASTQ_EOF;
212  return uint32(-1);
213  }
214 
215  m_read_q.push_back( c );
216  }
217 
218  m_line++;
219 
220  writer.push_back(
221  uint32( m_read_bp.size() ),
222  &m_name[0],
223  &m_read_bp[0],
224  &m_read_q[0] );
225 
226  ++n;
227  }
228  return n;
229 }
230 
231 // error string
232 //
// NOTE(review): the declarator line (listing line 234) is missing here, so the
// function name and the declaration of `error` are not visible.
//
// Formats a description of the last parsing error into `error`, made of the
// current line counter, the marker that was expected and the character that
// was found instead.
//
// NOTE(review): sprintf() does not bound its output; the required size of the
// `error` buffer cannot be determined from this listing -- confirm at the
// call sites.
233 template <typename FASTQ_stream>
235 {
// error code 1: a record did not start with '@'
236  if (m_error == 1)
237  sprintf(error, "line %u, expected '@', got '%c'", m_line, m_error_char);
// every other error code is reported as a missing '+' separator
238  else
239  sprintf(error, "line %u, expected '+', got '%c'", m_line, m_error_char);
240 }
241 
242 } // namespace nvbio