NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
sequence_encoder.cpp
Go to the documentation of this file.
1 /*
2  * nvbio
3  * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
29 #include <stdio.h>
30 
31 namespace nvbio {
32 namespace io {
33 
34 namespace { // anonymous
35 
36 // converts ASCII characters for amino-acids into
37 // a 5 letter alphabet for { A, C, G, T, N }.
38 inline unsigned char nst_nt4_encode(unsigned char c)
39 {
40  static unsigned char nst_nt4_table[256] = {
41  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
42  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
43  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5 /*'-'*/, 4, 4,
44  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
45  4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
46  4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
47  4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
48  4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
49  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
50  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
51  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
52  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
53  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
54  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
55  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
56  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
57  };
58 
59  return nst_nt4_table[c];
60 }
61 
62 // convert a quality value in one of the supported encodings to Phred
63 template <QualityEncoding encoding>
64 inline unsigned char convert_to_phred_quality(const uint8 q)
65 {
66  // this table maps Solexa quality values to Phred scale
67  static unsigned char s_solexa_to_phred[] = {
68  0, 1, 1, 1, 1, 1, 1, 2, 2, 3,
69  3, 4, 4, 5, 5, 6, 7, 8, 9, 10,
70  10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
71  20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
72  30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
73  40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
74  50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
75  60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
76  70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
77  80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
78  90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
79  100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
80  110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
81  120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
82  130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
83  140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
84  150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
85  160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
86  170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
87  180, 181, 182, 183, 184, 185, 186, 187, 188, 189,
88  190, 191, 192, 193, 194, 195, 196, 197, 198, 199,
89  200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
90  210, 211, 212, 213, 214, 215, 216, 217, 218, 219,
91  220, 221, 222, 223, 224, 225, 226, 227, 228, 229,
92  230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
93  240, 241, 242, 243, 244, 245, 246, 247, 248, 249,
94  250, 251, 252, 253, 254, 255
95  };
96 
97  switch(encoding)
98  {
99  case Phred:
100  return q;
101 
102  case Phred33:
103  return q - 33;
104 
105  case Phred64:
106  return q - 64;
107 
108  case Solexa:
109  return s_solexa_to_phred[q];
110 
111  default:
112  break;
113  }
114 
115  // gcc is dumb
116  return q;
117 }
118 
119 } // anonymous namespace
120 
121 // a small sequence class supporting REVERSE | COMPLEMENT operations
122 //
123 template <Alphabet ALPHABET, SequenceDataEncoder::StrandOp FLAGS>
125 {
126  // constructor
127  sequence_string(const uint32 len, const uint8* sequence, const uint8* qual) : m_len(len), m_sequence(sequence), m_qual(qual) {}
128 
129  // string length
130  uint32 length() const { return m_len; }
131 
132  // indexing operator
133  uint8 operator[] (const uint32 i) const
134  {
135  const uint32 index = (FLAGS & SequenceDataEncoder::REVERSE_OP) ? m_len - i - 1u : i;
136 
137  // fetch the bp
138  const uint8 c = m_sequence[index];
139 
140  // and convert it to the proper alphabet
141  if (ALPHABET == DNA || ALPHABET == DNA_N)
142  {
143  const uint8 bp = nst_nt4_encode( c );
144 
146  return bp < 4u ? 3u - bp : 4u;
147 
148  return bp;
149  }
150  else
151  {
152  const uint8 bp = from_char<ALPHABET>( c );
153 
154  // TODO: implement complementing!
155  //if (FLAGS & SequenceDataEncoder::COMPLEMENT_OP)
156  // return ???;
157 
158  return bp;
159  }
160  }
161 
162  // quality operator
163  uint8 quality(const uint32 i) const
164  {
165  const uint32 index = (FLAGS & REVERSE) ? m_len - i - 1u : i;
166 
167  return m_qual[index];
168  }
169 
170  const uint32 m_len;
172  const uint8* m_qual;
173 };
174 
175 
176 
177 // encode a sequence according to a compile-time quality-encoding
178 //
179 template <Alphabet ALPHABET, QualityEncoding quality_encoding, typename sequence_type>
180 void encode(
181  const sequence_type sequence,
183  char* qual_stream)
184 {
185  #if 1
186  const uint32 len = sequence.length();
187 
188  // use the custom PackedStream assign() method
189  assign( len, sequence, stream );
190 
191  // naive serial implementation
192  for (uint32 i = 0; i < len; i++)
193  qual_stream[i] = convert_to_phred_quality<quality_encoding>(sequence.quality(i));
194  #else
195  // naive serial implementation
196  for (uint32 i = 0; i < sequence.length(); i++)
197  {
198  stream[i] = sequence[i];
199  qual_stream[i] = convert_to_phred_quality<quality_encoding>(sequence.quality(i));
200  }
201  #endif
202 }
203 
204 // encode a sequence according to some compile-time flags and run-time quality-encoding
205 //
206 template <Alphabet ALPHABET, typename sequence_type>
207 void encode(
208  const QualityEncoding quality_encoding,
209  const sequence_type sequence,
211  char* qual_stream)
212 {
213  switch (quality_encoding)
214  {
215  case Phred:
216  encode<ALPHABET,Phred>( sequence, stream, qual_stream );
217  break;
218  case Phred33:
219  encode<ALPHABET,Phred33>( sequence, stream, qual_stream );
220  break;
221  case Phred64:
222  encode<ALPHABET,Phred64>( sequence, stream, qual_stream );
223  break;
224  case Solexa:
225  encode<ALPHABET,Solexa>( sequence, stream, qual_stream );
226  break;
227 
228  default:
229  break;
230  }
231 }
232 
233 // encode a sequence according to some given run-time flags and quality-encoding
234 //
235 template <Alphabet ALPHABET>
236 void encode(
237  const SequenceDataEncoder::StrandOp conversion_flags,
238  const QualityEncoding quality_encoding,
239  const uint32 sequence_len,
240  const uint8* sequence,
241  const uint8* quality,
243  char* qual_stream)
244 {
245  const sequence_string<ALPHABET,SequenceDataEncoder::REVERSE_OP> r_sequence( sequence_len, sequence, quality );
246  const sequence_string<ALPHABET,SequenceDataEncoder::REVERSE_COMPLEMENT_OP> rc_sequence( sequence_len, sequence, quality );
247  const sequence_string<ALPHABET,SequenceDataEncoder::COMPLEMENT_OP> fc_sequence( sequence_len, sequence, quality );
248  const sequence_string<ALPHABET,SequenceDataEncoder::NO_OP> f_sequence( sequence_len, sequence, quality );
249 
250  if (conversion_flags & SequenceDataEncoder::REVERSE_OP)
251  {
252  if (conversion_flags & SequenceDataEncoder::COMPLEMENT_OP)
253  encode<ALPHABET>( quality_encoding, rc_sequence, stream, qual_stream );
254  else
255  encode<ALPHABET>( quality_encoding, r_sequence, stream, qual_stream );
256  }
257  else
258  {
259  if (conversion_flags & SequenceDataEncoder::COMPLEMENT_OP)
260  encode<ALPHABET>( quality_encoding, fc_sequence, stream, qual_stream );
261  else
262  encode<ALPHABET>( quality_encoding, f_sequence, stream, qual_stream );
263  }
264 }
265 
269 template <Alphabet SEQUENCE_ALPHABET>
271 {
272  // symbol size for sequences
276 
280  SequenceDataEncoder( SEQUENCE_ALPHABET ),
281  m_data( data ),
282  m_append( append ) {}
283 
286  void reserve(const uint32 n_sequences, const uint32 n_bps)
287  {
288  if (m_append)
289  m_data->reserve( m_data->size() + n_sequences, m_data->bps() + n_bps );
290  else
291  m_data->reserve( n_sequences, n_bps );
292  }
293 
296  void begin_batch(void)
297  {
298  if (m_append == false)
299  {
300  // reset the batch
301  m_data->SequenceDataInfo::operator=( SequenceDataInfo() );
302 
303  //m_data->m_sequence_vec.resize( 0 );
304  //m_data->m_qual_vec.resize( 0 );
305  //m_data->m_name_vec.resize( 0 );
306 
307  if (m_data->m_sequence_index_vec.size() < 1)
308  m_data->m_sequence_index_vec.resize( 1 );
309  m_data->m_sequence_index_vec[0] = 0;
310 
311  if (m_data->m_name_index_vec.size() < 1)
312  m_data->m_name_index_vec.resize( 1 );
313  m_data->m_name_index_vec[0] = 0;
314  }
315 
316  // assign the alphabet
317  m_data->m_alphabet = SEQUENCE_ALPHABET;
318  m_data->m_has_qualities = true;
319  }
320 
331  void push_back(
332  const uint32 in_sequence_len,
333  const char* name,
334  const uint8* base_pairs,
335  const uint8* quality,
336  const QualityEncoding quality_encoding,
337  const uint32 max_sequence_len,
338  const uint32 trim3,
339  const uint32 trim5,
340  const StrandOp conversion_flags)
341  {
342  const uint32 trimmed_len = in_sequence_len > trim3 + trim5 ?
343  in_sequence_len - trim3 - trim5 : 0u;
344 
345  // truncate sequence
346  const uint32 sequence_len = nvbio::min( trimmed_len, max_sequence_len );
347 
348  base_pairs += trim5;
349  quality += trim5;
350 
351  assert(sequence_len);
352 
353  // resize the sequences & quality buffers
354  {
355  static const uint32 bps_per_word = 32u / SEQUENCE_BITS;
356  const uint32 stream_len = m_data->m_sequence_stream_len + sequence_len;
357  const uint32 words = util::divide_ri( stream_len, bps_per_word );
358 
359  if (m_data->m_sequence_vec.size() < words)
360  m_data->m_sequence_vec.resize( words*2 );
361  if (m_data->m_qual_vec.size() < stream_len)
362  m_data->m_qual_vec.resize( stream_len*2 );
363 
364  m_data->m_sequence_stream_words = words;
365  }
366 
367  // encode the sequence data
369  encode<SEQUENCE_ALPHABET>(
370  conversion_flags,
371  quality_encoding,
372  sequence_len,
373  base_pairs,
374  quality,
375  stream + m_data->m_sequence_stream_len,
376  nvbio::raw_pointer( m_data->m_qual_vec ) + m_data->m_sequence_stream_len );
377 
378  // update sequence and bp counts
379  m_data->m_n_seqs++;
380  m_data->m_sequence_stream_len += sequence_len;
381 
382  if (m_data->m_sequence_index_vec.size() < m_data->m_n_seqs + 1u)
383  m_data->m_sequence_index_vec.resize( (m_data->m_n_seqs + 1u)*2 );
384  m_data->m_sequence_index_vec[ m_data->m_n_seqs ] = m_data->m_sequence_stream_len;
385 
386  m_data->m_min_sequence_len = nvbio::min( m_data->m_min_sequence_len, sequence_len );
387  m_data->m_max_sequence_len = nvbio::max( m_data->m_max_sequence_len, sequence_len );
388 
389  // store the sequence name
390  const uint32 name_len = uint32(strlen(name));
391  const uint32 name_offset = m_data->m_name_stream_len;
392 
393  if (m_data->m_name_vec.size() < name_offset + name_len + 1)
394  m_data->m_name_vec.resize( (name_offset + name_len + 1)*2 );
395  memcpy( nvbio::raw_pointer( m_data->m_name_vec ) + name_offset, name, name_len + 1 );
396 
397  m_data->m_name_stream_len += name_len + 1;
398 
399  if (m_data->m_name_index_vec.size() < m_data->m_n_seqs + 1u)
400  m_data->m_name_index_vec.resize( (m_data->m_n_seqs + 1u)*2 );
401  m_data->m_name_index_vec[ m_data->m_n_seqs ] = m_data->m_name_stream_len;
402  }
403 
406  void end_batch(void)
407  {
409 
410  m_data->m_avg_sequence_len = (uint32) ceilf(float(m_data->m_sequence_stream_len) / float(m_data->m_n_seqs));
411  }
412 
415  const SequenceDataInfo* info() const { return m_data; }
416 
417 private:
418  SequenceDataHost* m_data;
419  bool m_append;
420 };
421 
422 // create a sequence encoder
423 //
425 {
426  switch (alphabet)
427  {
428  case DNA:
429  return new SequenceDataEncoderImpl<DNA>( data );
430  break;
431  case DNA_N:
432  return new SequenceDataEncoderImpl<DNA_N>( data );
433  break;
434  case PROTEIN:
435  return new SequenceDataEncoderImpl<PROTEIN>( data );
436  break;
437  case RNA:
438  return new SequenceDataEncoderImpl<RNA>( data );
439  break;
440  case RNA_N:
441  return new SequenceDataEncoderImpl<RNA_N>( data );
442  break;
443  case ASCII:
444  return new SequenceDataEncoderImpl<ASCII>( data );
445  break;
446 
447  default:
448  break;
449  }
450  return NULL;
451 }
452 
453 // next batch
454 //
455 int next(const Alphabet alphabet, SequenceDataHost* data, SequenceDataStream* stream, const uint32 batch_size, const uint32 batch_bps)
456 {
457  switch (alphabet)
458  {
459  case DNA:
460  {
461  SequenceDataEncoderImpl<DNA> encoder( data );
462  return stream->next( &encoder, batch_size, batch_bps );
463  }
464  break;
465  case DNA_N:
466  {
467  SequenceDataEncoderImpl<DNA_N> encoder( data );
468  return stream->next( &encoder, batch_size, batch_bps );
469  }
470  break;
471  case PROTEIN:
472  {
473  SequenceDataEncoderImpl<PROTEIN> encoder( data );
474  return stream->next( &encoder, batch_size, batch_bps );
475  }
476  break;
477  case RNA:
478  {
479  SequenceDataEncoderImpl<RNA> encoder( data );
480  return stream->next( &encoder, batch_size, batch_bps );
481  }
482  break;
483  case RNA_N:
484  {
485  SequenceDataEncoderImpl<RNA_N> encoder( data );
486  return stream->next( &encoder, batch_size, batch_bps );
487  }
488  break;
489  case ASCII:
490  {
491  SequenceDataEncoderImpl<ASCII> encoder( data );
492  return stream->next( &encoder, batch_size, batch_bps );
493  }
494  break;
495 
496  default:
497  break;
498  }
499  return 0;
500 }
501 
502 // next batch
503 //
504 int append(const Alphabet alphabet, SequenceDataHost* data, SequenceDataStream* stream, const uint32 batch_size, const uint32 batch_bps)
505 {
506  switch (alphabet)
507  {
508  case DNA:
509  {
510  SequenceDataEncoderImpl<DNA> encoder( data, true );
511  return stream->next( &encoder, batch_size, batch_bps );
512  }
513  break;
514  case DNA_N:
515  {
516  SequenceDataEncoderImpl<DNA_N> encoder( data, true );
517  return stream->next( &encoder, batch_size, batch_bps );
518  }
519  break;
520  case PROTEIN:
521  {
522  SequenceDataEncoderImpl<PROTEIN> encoder( data, true );
523  return stream->next( &encoder, batch_size, batch_bps );
524  }
525  break;
526  case RNA:
527  {
528  SequenceDataEncoderImpl<DNA> encoder( data, true );
529  return stream->next( &encoder, batch_size, batch_bps );
530  }
531  break;
532  case RNA_N:
533  {
534  SequenceDataEncoderImpl<RNA_N> encoder( data, true );
535  return stream->next( &encoder, batch_size, batch_bps );
536  }
537  break;
538  case ASCII:
539  {
540  SequenceDataEncoderImpl<ASCII> encoder( data, true );
541  return stream->next( &encoder, batch_size, batch_bps );
542  }
543  break;
544 
545  default:
546  break;
547  }
548  return 0;
549 }
550 
551 // next batch
552 //
553 int skip(SequenceDataStream* stream, const uint32 batch_size, const uint32 batch_bps)
554 {
555  SequenceDataEncoder encoder( PROTEIN );
556  return stream->next( &encoder, batch_size );
557 }
558 
559 } // namespace io
560 } // namespace nvbio