NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
sequence_txt.cpp
Go to the documentation of this file.
1 /*
2  * nvbio
3  * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
30 #include <nvbio/basic/types.h>
31 
32 #include <string.h>
33 #include <ctype.h>
34 
35 namespace nvbio {
36 namespace io {
37 
40 
43 
46 
48 {
49  const char* name = "";
50 
51  uint32 n_reads = 0;
52  uint32 n_bps = 0;
53 
54  const uint32 read_mult =
55  ((m_options.flags & FORWARD) ? 1u : 0u) +
56  ((m_options.flags & REVERSE) ? 1u : 0u) +
57  ((m_options.flags & FORWARD_COMPLEMENT) ? 1u : 0u) +
58  ((m_options.flags & REVERSE_COMPLEMENT) ? 1u : 0u);
59 
60  const uint32 max_read_len = 16*1024*1024;
61 
62  const uint8 best_quality = uint8('~'); // ASCII code 126
63 
64  if (m_read_bp.size() < max_read_len)
65  {
66  m_read_bp.resize( max_read_len );
67  m_read_q.resize( max_read_len, best_quality );
68  }
69 
70  while (n_reads + read_mult <= max_reads &&
71  n_bps + read_mult*SequenceDataFile::LONG_READ <= max_bps)
72  {
73  #define NVBIO_TXT_LINE_PARSING
74 
75  #if defined(NVBIO_TXT_LINE_PARSING)
76  uint32 read_len = 0;
77  uint32 read_off = 0;
78 
79  while (1)
80  {
81  char* read_ptr = (char*)&m_read_bp[0] + read_off;
82 
83  if (gets( read_ptr, m_read_bp.size() - read_off ) == 0)
84  {
86  break;
87  }
88 
89  // look for the new read length
90  read_len = strlen( read_ptr ) + read_off;
91 
92  // check whether we reached the end of the buffer without finding a newline
93  if (read_len == m_read_bp.size()-1 &&
94  m_read_bp[ read_len-1 ] != '\n')
95  {
96  // resize the read
97  m_read_bp.resize( m_read_bp.size() * 2 );
98 
99  // advance the buffer offset
100  read_off = read_len;
101  }
102  else
103  break;
104  }
105 
106  // drop the newline
107  if (read_len && m_read_bp[ read_len-1 ] == '\n')
108  read_len--;
109  #else
110  // reset the read
111  uint32 read_len = 0;
112 
113  // read an entire line
114  for (uint8 c = get(); c != '\n' && c != 0; c = get())
115  {
116  // if (isgraph(c))
117  if (c >= 0x21 && c <= 0x7E)
118  {
119  if (m_read_bp.size() < read_len)
120  m_read_bp.resize( read_len );
121 
122  m_read_bp[ read_len++ ] = c;
123  }
124  }
125  #endif
126 
127  ++m_line;
128 
129  if (m_read_q.size() < read_len)
130  {
131  // extend the quality score vector if needed
132  const size_t old_size = m_read_q.size();
133  m_read_q.resize( read_len );
134  for (size_t i = old_size; i < read_len; ++i)
135  m_read_q[i] = best_quality;
136  }
137 
138  if (read_len)
139  {
140  if (m_options.flags & FORWARD)
141  {
142  output->push_back(
143  read_len,
144  name,
145  &m_read_bp[0],
146  &m_read_q[0],
152  }
153  if (m_options.flags & REVERSE)
154  {
155  output->push_back(
156  read_len,
157  name,
158  &m_read_bp[0],
159  &m_read_q[0],
165  }
167  {
168  output->push_back(
169  read_len,
170  name,
171  &m_read_bp[0],
172  &m_read_q[0],
178  }
180  {
181  output->push_back(
182  read_len,
183  name,
184  &m_read_bp[0],
185  &m_read_q[0],
191  }
192 
193  n_bps += read_mult * (uint32)read_len;
194  n_reads += read_mult;
195  }
196 
197  // check for end-of-file
198  if (m_file_state != FILE_OK)
199  break;
200  }
201  return n_reads;
202 }
203 
205  const char* read_file_name,
206  const Options& options,
207  const uint32 buffer_size)
208  : SequenceDataFile_TXT(read_file_name, options, buffer_size)
209 {
210  m_file = gzopen(read_file_name, "r");
211  if (!m_file) {
213  } else {
215  }
216 
217  gzbuffer(m_file, m_buffer_size);
218 }
219 
221 {
222  gzclose( m_file );
223 }
224 
225 // rewind the file
226 //
228 {
229  if (m_file == NULL)
230  return false;
231 
232  gzrewind( m_file );
233 
235 
236  m_buffer_size = (uint32)m_buffer.size();
237  m_buffer_pos = (uint32)m_buffer.size();
238  m_line = 0;
239  return true;
240 }
241 
243 {
244  m_buffer_size = gzread(m_file, &m_buffer[0], (uint32)m_buffer.size());
245  if (m_buffer_size <= 0)
246  {
247  // check for EOF separately; zlib will not always return Z_STREAM_END at EOF below
248  if (gzeof(m_file))
249  {
250  return FILE_EOF;
251  } else {
252  // ask zlib what happened and inform the user
253  int err;
254  const char *msg;
255 
256  msg = gzerror(m_file, &err);
257  // we're making the assumption that we never see Z_STREAM_END here
258  assert(err != Z_STREAM_END);
259 
260  log_error(stderr, "error processing TXT file: zlib error %d (%s)\n", err, msg);
261  return FILE_STREAM_ERROR;
262  }
263  }
264 
265  return FILE_OK;
266 }
267 
268 // constructor
269 //
271  const char* file_name,
272  const char* compressor,
273  const char* options)
274  : m_file_name(file_name)
275 {
276  m_file = open_output_file( file_name, compressor, options );
277 }
278 
279 namespace {
280 
281 template <Alphabet ALPHABET>
282 void write(
283  OutputStream* output_file,
284  const io::SequenceDataAccess<ALPHABET>& sequence_data)
285 {
287 
288  const uint32 buffer_len = sequence_data.size() + sequence_data.bps();
289 
290  std::vector<char> buffer( buffer_len );
291 
292  #pragma omp parallel for
293  for (int32 i = 0; i < (int32)sequence_data.size(); ++i)
294  {
295  const sequence_string read = sequence_data.get_read( i );
296 
297  uint32 buffer_offset = sequence_data.get_range( i ).x + i;
298 
299  to_string<ALPHABET>( read.begin(), read.size(), &buffer[0] + buffer_offset );
300 
301  buffer[ buffer_offset + read.size() ] = '\n';
302  }
303 
304  if (output_file->write( buffer_len, &buffer[0] ) == 0)
305  throw runtime_error( "failed writing TXT output file" );
306 }
307 
308 } // anonymous namespace
309 
310 // next batch
311 //
313 {
314  if (sequence_data.alphabet() == DNA)
315  write( m_file, io::SequenceDataAccess<DNA>( sequence_data ) );
316  else if (sequence_data.alphabet() == DNA_N)
317  write( m_file, io::SequenceDataAccess<DNA_N>( sequence_data ) );
318  else if (sequence_data.alphabet() == PROTEIN)
319  write( m_file, io::SequenceDataAccess<PROTEIN>( sequence_data ) );
320  else if (sequence_data.alphabet() == RNA)
321  write( m_file, io::SequenceDataAccess<RNA>( sequence_data ) );
322  else if (sequence_data.alphabet() == RNA_N)
323  write( m_file, io::SequenceDataAccess<RNA_N>( sequence_data ) );
324  else if (sequence_data.alphabet() == ASCII)
325  write( m_file, io::SequenceDataAccess<ASCII>( sequence_data ) );
326 }
327 
328 // return whether the stream is ok
329 //
330 bool SequenceDataOutputFile_TXT::is_ok() { return m_file && m_file->is_valid(); }
331 
335 
336 } // namespace io
337 } // namespace nvbio