NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
sequence_fasta.cpp
Go to the documentation of this file.
1 /*
2  * nvbio
3  * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
30 #include <nvbio/basic/types.h>
31 #include <nvbio/basic/timer.h>
32 
33 #include <string.h>
34 #include <ctype.h>
35 
36 namespace nvbio {
37 namespace io {
38 
41 
44 
47 
48 namespace { // anonymous namespace
49 
50 struct FASTAHandler
51 {
52  FASTAHandler(
53  SequenceDataEncoder* output,
54  const SequenceDataFile::Options& options) :
55  m_output( output ),
56  m_options( options ) {}
57 
58  void push_back(const char* id, const uint32 read_len, const uint8* bp)
59  {
60  if (m_quals.size() < size_t( read_len ))
61  m_quals.resize( read_len, 50u );
62 
63  if (m_options.flags & FORWARD)
64  {
65  m_output->push_back(
66  read_len,
67  id,
68  bp,
69  &m_quals[0],
70  m_options.qualities,
71  m_options.max_sequence_len,
72  m_options.trim3,
73  m_options.trim5,
75  }
76  if (m_options.flags & REVERSE)
77  {
78  m_output->push_back(
79  read_len,
80  id,
81  bp,
82  &m_quals[0],
83  m_options.qualities,
84  m_options.max_sequence_len,
85  m_options.trim3,
86  m_options.trim5,
88  }
89  if (m_options.flags & FORWARD_COMPLEMENT)
90  {
91  m_output->push_back(
92  read_len,
93  id,
94  bp,
95  &m_quals[0],
96  m_options.qualities,
97  m_options.max_sequence_len,
98  m_options.trim3,
99  m_options.trim5,
101  }
102  if (m_options.flags & REVERSE_COMPLEMENT)
103  {
104  m_output->push_back(
105  read_len,
106  id,
107  bp,
108  &m_quals[0],
109  m_options.qualities,
110  m_options.max_sequence_len,
111  m_options.trim3,
112  m_options.trim5,
114  }
115  }
116 
117  SequenceDataEncoder* m_output;
118  const SequenceDataFile::Options m_options;
119  std::vector<uint8> m_quals;
120 };
121 
122 } // anonymous namespace
123 
124 // constructor
125 //
127  const char* read_file_name,
128  const SequenceDataFile::Options& options) :
129  SequenceDataFile( options ),
130  m_fasta_reader( read_file_name )
131 {
132  if (!m_fasta_reader.valid()) {
134  } else {
136  }
137 }
138 
139 // rewind
140 //
142 {
143  if (!m_fasta_reader.valid() || (m_file_state != FILE_OK && m_file_state != FILE_EOF))
144  return false;
145 
146  m_fasta_reader.rewind();
148  return true;
149 }
150 
151 // get a chunk of reads
152 //
154 {
155  const uint32 read_mult =
156  ((m_options.flags & FORWARD) ? 1u : 0u) +
157  ((m_options.flags & REVERSE) ? 1u : 0u) +
158  ((m_options.flags & FORWARD_COMPLEMENT) ? 1u : 0u) +
159  ((m_options.flags & REVERSE_COMPLEMENT) ? 1u : 0u);
160 
161  // build a writer
162  FASTAHandler writer( output, m_options );
163 
164  return m_fasta_reader.read( max_reads / read_mult, writer );
165 }
166 
167 // constructor
168 //
170  const char* file_name,
171  const char* compressor,
172  const char* options)
173  : m_file_name(file_name)
174 {
175  m_file = open_output_file( file_name, compressor, options );
176 }
177 
178 namespace {
179 
180 template <Alphabet ALPHABET>
181 void write(
182  OutputStream* output_file,
183  const io::SequenceDataAccess<ALPHABET>& sequence_data)
184 {
186  typedef typename io::SequenceDataAccess<ALPHABET>::name_string name_string;
187 
188 
189  const uint32 buffer_len =
190  sequence_data.size() * 2 + // the number of delimiters consumed for each read (i.e. '>', '\n')
191  sequence_data.bps() + // the number of characters consumed by reads
192  sequence_data.name_stream_len(); // the number of characters consumed by names
193 
194  std::vector<char> buffer( buffer_len );
195 
196  #pragma omp parallel for
197  for (int32 i = 0; i < (int32)sequence_data.size(); ++i)
198  {
199  const sequence_string read = sequence_data.get_read( i );
200  const name_string name = sequence_data.get_name( i );
201 
202  uint32 buffer_offset = i*2 + sequence_data.get_range( i ).x + sequence_data.name_index()[ i ];
203 
204  buffer[ buffer_offset++ ] = '@';
205 
206  for (uint32 j = 0; j < name.size(); ++j)
207  buffer[ buffer_offset++ ] = name[j];
208 
209  // setup the ASCII read
210  buffer[ buffer_offset++ ] = '\n';
211 
212  to_string<ALPHABET>( read.begin(), read.size(), &buffer[0] + buffer_offset );
213 
214  buffer_offset += read.size();
215 
216  buffer[ buffer_offset++ ] = '\n';
217  }
218 
219  if (output_file->write( buffer_len, &buffer[0] ) == 0)
220  throw runtime_error( "failed writing FASTQ output file" );
221 }
222 
223 } // anonymous namespace
224 
225 // next batch
226 //
228 {
229  if (sequence_data.alphabet() == DNA)
230  write( m_file, io::SequenceDataAccess<DNA>( sequence_data ) );
231  else if (sequence_data.alphabet() == DNA_N)
232  write( m_file, io::SequenceDataAccess<DNA_N>( sequence_data ) );
233  else if (sequence_data.alphabet() == PROTEIN)
234  write( m_file, io::SequenceDataAccess<PROTEIN>( sequence_data ) );
235  else if (sequence_data.alphabet() == RNA)
236  write( m_file, io::SequenceDataAccess<RNA>( sequence_data ) );
237  else if (sequence_data.alphabet() == RNA_N)
238  write( m_file, io::SequenceDataAccess<RNA_N>( sequence_data ) );
239  else if (sequence_data.alphabet() == ASCII)
240  write( m_file, io::SequenceDataAccess<ASCII>( sequence_data ) );
241 }
242 
243 // return whether the stream is ok
244 //
245 bool SequenceDataOutputFile_FASTA::is_ok() { return m_file && m_file->is_valid(); }
246 
250 
251 } // namespace io
252 } // namespace nvbio