NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
sequence_pac.cpp
Go to the documentation of this file.
1 /*
2  * nvbio
3  * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
30 #include <nvbio/basic/bnt.h>
31 #include <nvbio/basic/console.h>
32 #include <stdio.h>
33 #include <stdlib.h>
34 
35 namespace nvbio {
36 namespace io {
37 
38 namespace { // anonymous namespace
39 
40 template <typename T>
41 uint64 block_fread(T* dst, const uint64 n, FILE* file)
42 {
43 #if defined(WIN32)
44  // use blocked reads on Windows, which seems to otherwise become less responsive while reading.
45  const uint64 BATCH_SIZE = 16*1024*1024;
46  for (uint64 batch_begin = 0; batch_begin < n; batch_begin += BATCH_SIZE)
47  {
48  const uint64 batch_end = nvbio::min( batch_begin + BATCH_SIZE, n );
49  const uint64 batch_size = batch_end - batch_begin;
50 
51  const uint64 n_words = fread( dst + batch_begin, sizeof(T), batch_size, file );
52  if (n_words != batch_size)
53  return batch_begin + n_words;
54  }
55  return n;
56 #else
57  return fread( dst, sizeof(T), n, file );
58 #endif
59 }
60 
61 template <Alphabet ALPHABET>
62 bool load_pac(
63  const char* prefix,
64  uint32* stream,
65  uint32 seq_length,
66  uint32 seq_words)
67 {
68  std::string wpac_file_name = std::string( prefix ) + ".wpac";
69  std::string pac_file_name = std::string( prefix ) + ".pac";
70  const char* file_name = wpac_file_name.c_str();
71  bool wpac = true;
72 
73  FILE* file = fopen( wpac_file_name.c_str(), "rb" );
74  if (file == NULL)
75  {
76  file = fopen( pac_file_name.c_str(), "rb" );
77  file_name = pac_file_name.c_str();
78  wpac = false;
79  }
80 
81  if (file == NULL)
82  {
83  log_warning(stderr, "unable to open %s.[w]pac\n", prefix);
84  return false;
85  }
86 
87  typedef SequenceDataTraits<ALPHABET> sequence_traits;
88 
89  typedef PackedStream<
90  uint32*,uint8,
91  sequence_traits::SEQUENCE_BITS,
92  sequence_traits::SEQUENCE_BIG_ENDIAN> output_stream_type;
93 
94  if (wpac)
95  {
96  // read a .wpac file
97  uint64 field;
98  if (!fread( &field, sizeof(field), 1, file ))
99  {
100  log_error(stderr, "failed reading %s\n", file_name);
101  return false;
102  }
103 
104  const uint32 _seq_length = uint32(field);
105  if (_seq_length != seq_length)
106  {
107  log_error(stderr, "mismatching sequence lengths in %s, expected: %u, found: %u\n", file_name, seq_length, _seq_length);
108  return false;
109  }
110 
111  if (ALPHABET == DNA && sequence_traits::SEQUENCE_BIG_ENDIAN == true)
112  {
113  // read the 2-bit per symbol words in the final destination
114  const uint32 n_words = (uint32)block_fread( stream, seq_words, file );
115  if (n_words != seq_words)
116  {
117  log_error(stderr, "failed reading %s\n", file_name);
118  return false;
119  }
120  }
121  else
122  {
123  // read the 2-bit per symbol words in a temporary array
124  const uint32 pac_words = uint32( util::divide_ri( seq_length, 16u ) );
125 
126  std::vector<uint32> pac_vec( pac_words );
127  uint32* pac_stream = &pac_vec[0];
128 
129  const uint32 n_words = (uint32)block_fread( pac_stream, pac_words, file );
130  if (n_words != pac_words)
131  {
132  log_error(stderr, "failed reading %s\n", file_name);
133  return false;
134  }
135 
136  // build the input wpac stream
137  typedef PackedStream<const uint32*,uint8,2,true> pac_stream_type;
138  pac_stream_type pac( pac_stream );
139 
140  // build the output stream
141  output_stream_type out( stream );
142 
143  // copy the pac stream into the output
144  assign( seq_length, pac, out ); // TODO: transform pac using a DNA -> ALPHABET conversion
145  }
146  }
147  else
148  {
149  // read a .pac file
150  fseek( file, -1, SEEK_END );
151  const uint32 packed_file_len = ftell( file );
152  uint8 last_byte_len;
153  if (!fread( &last_byte_len, sizeof(unsigned char), 1, file ))
154  {
155  log_error(stderr, "failed reading %s\n", file_name);
156  return false;
157  }
158  const uint32 _seq_length = (packed_file_len - 1u) * 4u + last_byte_len;
159  if (_seq_length != seq_length)
160  {
161  log_error(stderr, "mismatching sequence lengths in %s, expected: %u, found: %u\n", file_name, seq_length, _seq_length);
162  return false;
163  }
164 
165  fseek( file, 0, SEEK_SET );
166 
167  const uint32 seq_bytes = uint32( util::divide_ri( seq_length, 4u ) );
168 
169  std::vector<uint8> pac_vec( seq_bytes );
170  uint8* pac_stream = &pac_vec[0];
171 
172  const uint64 n_bytes = block_fread( pac_stream, seq_bytes, file );
173  if (n_bytes != seq_bytes)
174  {
175  log_error(stderr, "failed reading %s\n", file_name);
176  return false;
177  }
178 
179  // build the input pac stream
180  typedef PackedStream<const uint8*,uint8,2,true> pac_stream_type;
181  pac_stream_type pac( pac_stream );
182 
183  // build the output stream
184  output_stream_type out( stream );
185 
186  // copy the pac stream into the output
187  assign( seq_length, pac, out ); // TODO: transform pac using a DNA -> ALPHABET conversion
188  }
189  fclose( file );
190  return true;
191 }
192 
193 struct BNTLoader : public nvbio::BNTSeqLoader
194 {
195  typedef nvbio::vector<host_tag,uint32> IndexVector;
196  typedef nvbio::vector<host_tag,char> StringVector;
197 
198  BNTLoader(
199  IndexVector& index_vec,
200  IndexVector& name_index_vec,
201  StringVector& name_vec) :
202  m_name_vec( &name_vec ),
203  m_name_index_vec( &name_index_vec ),
204  m_index_vec( &index_vec ) {}
205 
206  void set_info(const nvbio::BNTInfo info)
207  {
208  m_info.m_n_seqs = uint32( info.n_seqs );
209  m_info.m_sequence_stream_len = uint32( info.l_pac );
210  m_info.m_avg_sequence_len = uint32( info.l_pac / info.n_seqs );
211  m_info.m_min_sequence_len = uint32(-1);
212  m_info.m_max_sequence_len = 0u;
213  }
214  void read_ann(const nvbio::BNTAnnInfo& info, nvbio::BNTAnnData& data)
215  {
216  // keep track of the current name offset
217  const uint32 name_offset = (uint32)m_name_vec->size();
218 
219  // copy the name of this sequence into the output vector
220  m_name_vec->resize( name_offset + info.name.length() + 1u );
221  strcpy( &m_name_vec->front() + name_offset, info.name.c_str() );
222 
223  // push back the name and sequence offsets
224  m_name_index_vec->push_back( uint32( m_name_vec->size() ) );
225  m_index_vec->push_back( uint32( data.offset + data.len ) );
226 
227  // keep sequence stats
228  m_info.m_min_sequence_len = nvbio::min( m_info.m_min_sequence_len, uint32( data.len ) );
229  m_info.m_max_sequence_len = nvbio::max( m_info.m_max_sequence_len, uint32( data.len ) );
230  }
231 
232  void read_amb(const nvbio::BNTAmb& amb) {}
233 
234  SequenceDataInfo m_info;
235  StringVector* m_name_vec;
236  IndexVector* m_name_index_vec;
237  IndexVector* m_index_vec;
238 };
239 
240 } // anonymous namespace
241 
242 // check whether the file name points to a pac archive
243 //
244 bool is_pac_archive(const char* sequence_file_name)
245 {
246  const std::string ann = std::string(sequence_file_name) + ".ann";
247  const std::string pac = std::string(sequence_file_name) + ".pac";
248  const std::string wpac = std::string(sequence_file_name) + ".wpac";
249  FILE* ann_file = fopen( ann.c_str(), "rb" );
250  FILE* pac_file = fopen( pac.c_str(), "rb" );
251  FILE* wpac_file = fopen( wpac.c_str(), "rb" );
252 
253  bool ann_ok = ann_file != NULL;
254  bool seq_ok = (pac_file != NULL || wpac_file != NULL);
255 
256  if (ann_file) fclose( ann_file );
257  if (pac_file) fclose( pac_file );
258  if (wpac_file) fclose( wpac_file );
259 
260  return ann_ok && seq_ok;
261 }
262 
263 // load a sequence file
264 //
265 // \param sequence_file_name the file to open
266 // \param qualities the encoding of the qualities
267 // \param max_seqs maximum number of reads to input
268 // \param max_sequence_len maximum read length - reads will be truncated
269 // \param flags a set of flags indicating which strands to encode
270 // in the batch for each read.
271 // For example, passing FORWARD | REVERSE_COMPLEMENT
272 // will result in a stream containing BOTH the forward
273 // and reverse-complemented strands.
274 //
275 bool load_pac(
276  const Alphabet alphabet,
277  SequenceDataHost* sequence_data,
278  const char* prefix,
279  const SequenceFlags load_flags,
280  const QualityEncoding qualities)
281 {
282  // prepare the sequence index
283  sequence_data->m_sequence_index_vec.resize( 1 );
284  sequence_data->m_sequence_index_vec[0] = 0;
285 
286  // prepare the name index
287  sequence_data->m_name_index_vec.resize( 1 );
288  sequence_data->m_name_index_vec[0] = 0;
289 
290  // load the BNS files
291  SequenceDataInfo info;
292  try
293  {
294  BNTLoader loader(
295  sequence_data->m_sequence_index_vec,
296  sequence_data->m_name_index_vec,
297  sequence_data->m_name_vec );
298 
299  load_bns( &loader, prefix );
300 
301  info = loader.m_info;
302  }
303  catch (...)
304  {
305  log_error(stderr, "loading BNS files failed\n");
306  return false;
307  }
308 
309  const uint32 bits = bits_per_symbol( alphabet );
310  const uint32 symbols_per_word = 32 / bits;
311 
312  const uint32 seq_length = info.bps();
313  const uint32 seq_words = uint32( util::divide_ri( seq_length, symbols_per_word ) );
314  const uint32 aligned_seq_words = align<4>( seq_words );
315 
316  // setup all basic info
317  sequence_data->SequenceDataInfo::operator=( info );
318  sequence_data->m_alphabet = alphabet;
319  sequence_data->m_sequence_stream_words = aligned_seq_words;
320  sequence_data->m_name_stream_len = uint32( sequence_data->m_name_vec.size() );
321  sequence_data->m_has_qualities = false;
322 
323  // alloc sequence storage
324  sequence_data->m_sequence_vec.resize( sequence_data->m_sequence_stream_words );
325 
326  // initialize the alignment slack
327  for (uint32 i = seq_words; i < aligned_seq_words; ++i)
328  sequence_data->m_sequence_vec[i] = 0u;
329 
330  switch (alphabet)
331  {
332  case DNA:
333  return load_pac<DNA>( prefix, &sequence_data->m_sequence_vec[0], seq_length, seq_words );
334  break;
335  case DNA_N:
336  return load_pac<DNA_N>( prefix, &sequence_data->m_sequence_vec[0], seq_length, seq_words );
337  break;
338  case PROTEIN:
339  return load_pac<PROTEIN>( prefix, &sequence_data->m_sequence_vec[0], seq_length, seq_words );
340  break;
341 
342  default: break;
343  }
344  return false;
345 }
346 
347 // load a sequence file
348 //
349 // \param sequence_file_name the file to open
350 // \param qualities the encoding of the qualities
351 // \param max_seqs maximum number of reads to input
352 // \param max_sequence_len maximum read length - reads will be truncated
353 // \param flags a set of flags indicating which strands to encode
354 // in the batch for each read.
355 // For example, passing FORWARD | REVERSE_COMPLEMENT
356 // will result in a stream containing BOTH the forward
357 // and reverse-complemented strands.
358 //
359 bool load_pac(
360  const Alphabet alphabet,
361  SequenceDataMMAPServer* sequence_data,
362  const char* prefix,
363  const char* mapped_name,
364  const SequenceFlags load_flags,
365  const QualityEncoding qualities)
366 {
367  std::string info_name = SequenceDataMMAPServer::info_file_name( mapped_name );
368  std::string sequence_name = SequenceDataMMAPServer::sequence_file_name( mapped_name );
369  std::string sequence_index_name = SequenceDataMMAPServer::sequence_index_file_name( mapped_name );
370  std::string name_name = SequenceDataMMAPServer::name_file_name( mapped_name );
371  std::string name_index_name = SequenceDataMMAPServer::name_index_file_name( mapped_name );
372 
373  // prepare the sequence index
374  nvbio::vector<host_tag,uint32> sequence_index_vec;
375  nvbio::vector<host_tag,uint32> name_index_vec;
377 
378  sequence_index_vec.resize( 1 );
379  sequence_index_vec[0] = 0;
380 
381  // prepare the name index
382  name_index_vec.resize( 1 );
383  name_index_vec[0] = 0;
384 
385  // load the BNS files
386  SequenceDataInfo info;
387  try
388  {
389  BNTLoader loader( sequence_index_vec, name_index_vec, name_vec );
390  load_bns( &loader, prefix );
391 
392  info = loader.m_info;
393  }
394  catch (...)
395  {
396  log_error(stderr, "loading BNS files failed\n");
397  return false;
398  }
399 
400  const uint32 bits = bits_per_symbol( alphabet );
401  const uint32 symbols_per_word = 32 / bits;
402 
403  const uint32 seq_length = info.bps();
404  const uint32 seq_words = uint32( util::divide_ri( seq_length, symbols_per_word ) );
405  const uint32 aligned_seq_words = align<4>( seq_words );
406 
407  // setup all basic info
408  info.m_alphabet = alphabet;
409  info.m_sequence_stream_words = aligned_seq_words;
410  info.m_name_stream_len = uint32( name_vec.size() );
411  info.m_has_qualities = false;
412 
413  try
414  {
415  // alloc sequence storage
416  uint32* sequence_ptr = (uint32*)sequence_data->m_sequence_file.init(
417  sequence_name.c_str(),
418  aligned_seq_words * sizeof(uint32),
419  NULL );
420 
421  // initialize the alignment slack
422  for (uint32 i = seq_words; i < aligned_seq_words; ++i)
423  sequence_ptr[i] = 0u;
424 
425  // alloc sequence_index storage
426  uint32* sequence_index_ptr = (uint32*)sequence_data->m_sequence_index_file.init(
427  sequence_index_name.c_str(),
428  sequence_index_vec.size() * sizeof(uint32),
429  NULL );
430 
431  // alloc name_index storage
432  uint32* name_index_ptr = (uint32*)sequence_data->m_name_index_file.init(
433  name_index_name.c_str(),
434  name_index_vec.size() * sizeof(uint32),
435  NULL );
436 
437  // alloc name storage
438  char* name_ptr = (char*)sequence_data->m_name_file.init(
439  name_name.c_str(),
440  name_vec.size() * sizeof(char),
441  NULL );
442 
443  // alloc info storage
444  SequenceDataInfo* info_ptr = (SequenceDataInfo*)sequence_data->m_info_file.init(
445  info_name.c_str(),
446  sizeof(SequenceDataInfo),
447  NULL );
448 
449  // copy the loaded index and names vectors
450  memcpy( sequence_index_ptr, &sequence_index_vec[0], sequence_index_vec.size() * sizeof(uint32) );
451  memcpy( name_index_ptr, &name_index_vec[0], name_index_vec.size() * sizeof(uint32) );
452  memcpy( name_ptr, &name_vec[0], name_vec.size() * sizeof(char) );
453 
454  *info_ptr = info;
455 
456  // load the actual sequence
457  switch (alphabet)
458  {
459  case DNA:
460  return load_pac<DNA>( prefix, sequence_ptr, seq_length, seq_words );
461  break;
462  case DNA_N:
463  return load_pac<DNA_N>( prefix, sequence_ptr, seq_length, seq_words );
464  break;
465  case RNA:
466  return load_pac<RNA>( prefix, sequence_ptr, seq_length, seq_words );
467  break;
468  case RNA_N:
469  return load_pac<RNA_N>( prefix, sequence_ptr, seq_length, seq_words );
470  break;
471  case PROTEIN:
472  return load_pac<PROTEIN>( prefix, sequence_ptr, seq_length, seq_words );
473  break;
474 
475  default: break;
476  }
477  }
479  {
480  log_error(stderr, " mapping error while mapping file: %s (code: %u)\n", e.m_file_name, e.m_code );
481  }
483  {
484  log_error(stderr, " view error while mapping file: %s (code: %u)\n", e.m_file_name, e.m_code );
485  }
486  return false;
487 }
488 
491 
492 } // namespace io
493 } // namespace nvbio