NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
fasta_inl.h
Go to the documentation of this file.
1 /*
2  * nvbio
3  * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #pragma once
29 
30 namespace nvbio {
31 
32 // constructor
33 //
34 inline
35 FASTA_inc_reader::FASTA_inc_reader(const char* filename, const uint32 buffer_size)
36  : m_buffer( buffer_size ), m_buffer_size( 0 ), m_buffer_pos( 0 )
37 {
38  m_file = gzopen( filename, "r" );
39  if (m_file)
40  gzbuffer( m_file, buffer_size );
41 }
42 // destructor
43 //
44 inline
46 {
47  if (m_file)
48  gzclose( m_file );
49 }
50 
51 // read a batch of bp reads
52 //
53 // \tparam Writer an output handler class, which must
54 // implement the following interface:
55 //
56 // \code
57 // struct Writer
58 // {
59 // // called before starting to parse a new read
60 // void begin_read();
61 //
62 // // called upon completion of a single read
63 // void end_read();
64 //
65 // // provide the next character of the read id
66 // void id(const char c);
67 //
68 // // provide the next base of the read
69 // void read(const char c);
70 // }
71 // \endcode
72 //
73 template <typename Writer>
74 uint32 FASTA_inc_reader::read(const uint32 n_reads, Writer& writer)
75 {
76  uint32 n = 0;
77  uint8 c;
78  bool read_sequence = false;
79 
80  writer.begin_read();
81 
82  while ((c = get()) != 255)
83  {
84  // start of a new sequence?
85  if (c == '>')
86  {
87  // end the previous one
88  if (read_sequence)
89  {
90  writer.end_read();
91  writer.begin_read();
92 
93  if (n == n_reads)
94  return n;
95  }
96 
97  n++;
98 
99  // read the id
100  for (c = get(); c != ' ' && c != '\n'; c = get())
101  writer.id( c );
102  writer.id( '\0' );
103 
104  // read the rest of the line
105  while (c != '\n') { c = get(); }
106 
107  read_sequence = true;
108  }
109 
110  // save non-trivial characters into the read
111  if (read_sequence && c != '\n' && c != ' ')
112  writer.read( c );
113  }
114  // output the last sequence we've been reading
115  writer.end_read();
116 
117  return n;
118 }
119 
120 // get the next character, or 255 if EOF
121 //
122 inline
124 {
125  if (m_buffer_pos >= m_buffer_size)
126  {
127  m_buffer_size = uint32( gzread( m_file, &m_buffer[0], (unsigned int)m_buffer.size() ) );
128  m_buffer_pos = 0;
129  }
130  return (m_buffer_pos < m_buffer_size) ? m_buffer[ m_buffer_pos++ ] : 255u;
131 };
132 
133 // constructor
134 //
135 inline
136 FASTA_reader::FASTA_reader(const char* filename, const uint32 buffer_size)
137  : m_buffer( buffer_size ), m_buffer_size( 0 ), m_buffer_pos( 0 )
138 {
139  m_file = gzopen( filename, "r" );
140  if (m_file)
141  gzbuffer( m_file, buffer_size );
142 }
143 
144 // destructor
145 //
146 inline
148 {
149  if (m_file)
150  gzclose( m_file );
151 }
152 
153 // rewind the file
154 //
155 inline
157 {
158  if (m_file)
159  gzrewind( m_file );
160 
161  m_buffer_size = 0;
162  m_buffer_pos = 0;
163 }
164 
165 // read a batch of bp reads
166 //
167 // \tparam Writer an output handler class, which must
168 // implement the following interface:
169 //
170 // \code
171 // struct Writer
172 // {
173 // // called whenever a new read has been parsed
174 // void push_back(
175 // const char* id,
176 // const uint32 read_length,
177 // const uint8* read);
178 // }
179 // \endcode
180 //
181 template <typename Writer>
182 uint32 FASTA_reader::read(const uint32 n_reads, Writer& writer)
183 {
184  uint32 n = 0;
185  uint8 c;
186  bool read_sequence = false;
187 
188  while ((c = get()) != 255)
189  {
190  // start of a new sequence?
191  if (c == '>')
192  {
193  // output the previous one
194  if (read_sequence)
195  {
196  writer.push_back(
197  &m_id[0],
198  uint32( m_read.size() ),
199  &m_read[0] );
200 
201  if (n == n_reads)
202  return n;
203  }
204 
205  n++;
206 
207  // read the id
208  m_id.erase( m_id.begin(), m_id.end() );
209  for (c = get(); c != ' ' && c != '\n'; c = get())
210  m_id.push_back( char(c) );
211 
212  m_id.push_back('\0');
213 
214  // read the rest of the line
215  while (c != '\n') { c = get(); }
216 
217  // reset the read
218  m_read.erase( m_read.begin(), m_read.end() );
219 
220  read_sequence = true;
221  }
222 
223  // save non-trivial characters into the read
224  if (read_sequence && c != '\n' && c != ' ')
225  m_read.push_back( c );
226  }
227  // output the last sequence we've been reading
228  if (read_sequence)
229  {
230  writer.push_back(
231  &m_id[0],
232  uint32( m_read.size() ),
233  &m_read[0] );
234  }
235  return n;
236 }
237 
238 // get the next character, or 255 if EOF
239 //
240 inline
242 {
243  if (m_buffer_pos >= m_buffer_size)
244  {
245  m_buffer_size = uint32( gzread( m_file, &m_buffer[0], (unsigned int)m_buffer.size() ) );
246  m_buffer_pos = 0;
247  }
248  return (m_buffer_pos < m_buffer_size) ? m_buffer[ m_buffer_pos++ ] : 255u;
249 }
250 
251 } // namespace nvbio