45 const uint64 BATCH_SIZE = 16*1024*1024;
46 for (
uint64 batch_begin = 0; batch_begin < n; batch_begin += BATCH_SIZE)
49 const uint64 batch_size = batch_end - batch_begin;
51 const uint64 n_words = fread( dst + batch_begin,
sizeof(T), batch_size, file );
52 if (n_words != batch_size)
53 return batch_begin + n_words;
57 return fread( dst,
sizeof(T), n, file );
61 template <Alphabet ALPHABET>
68 std::string wpac_file_name = std::string( prefix ) +
".wpac";
69 std::string pac_file_name = std::string( prefix ) +
".pac";
70 const char* file_name = wpac_file_name.c_str();
73 FILE* file = fopen( wpac_file_name.c_str(),
"rb" );
76 file = fopen( pac_file_name.c_str(),
"rb" );
77 file_name = pac_file_name.c_str();
83 log_warning(stderr,
"unable to open %s.[w]pac\n", prefix);
87 typedef SequenceDataTraits<ALPHABET> sequence_traits;
91 sequence_traits::SEQUENCE_BITS,
92 sequence_traits::SEQUENCE_BIG_ENDIAN> output_stream_type;
98 if (!fread( &field,
sizeof(field), 1, file ))
100 log_error(stderr,
"failed reading %s\n", file_name);
104 const uint32 _seq_length =
uint32(field);
105 if (_seq_length != seq_length)
107 log_error(stderr,
"mismatching sequence lengths in %s, expected: %u, found: %u\n", file_name, seq_length, _seq_length);
111 if (ALPHABET ==
DNA && sequence_traits::SEQUENCE_BIG_ENDIAN ==
true)
114 const uint32 n_words = (
uint32)block_fread( stream, seq_words, file );
115 if (n_words != seq_words)
117 log_error(stderr,
"failed reading %s\n", file_name);
126 std::vector<uint32> pac_vec( pac_words );
127 uint32* pac_stream = &pac_vec[0];
129 const uint32 n_words = (
uint32)block_fread( pac_stream, pac_words, file );
130 if (n_words != pac_words)
132 log_error(stderr,
"failed reading %s\n", file_name);
137 typedef PackedStream<const uint32*,uint8,2,true> pac_stream_type;
138 pac_stream_type pac( pac_stream );
141 output_stream_type out( stream );
144 assign( seq_length, pac, out );
151 const uint32 packed_file_len = ftell( file );
153 if (!fread( &last_byte_len,
sizeof(
unsigned char), 1, file ))
155 log_error(stderr,
"failed reading %s\n", file_name);
158 const uint32 _seq_length = (packed_file_len - 1u) * 4u + last_byte_len;
159 if (_seq_length != seq_length)
161 log_error(stderr,
"mismatching sequence lengths in %s, expected: %u, found: %u\n", file_name, seq_length, _seq_length);
169 std::vector<uint8> pac_vec( seq_bytes );
170 uint8* pac_stream = &pac_vec[0];
172 const uint64 n_bytes = block_fread( pac_stream, seq_bytes, file );
173 if (n_bytes != seq_bytes)
175 log_error(stderr,
"failed reading %s\n", file_name);
180 typedef PackedStream<const uint8*,uint8,2,true> pac_stream_type;
181 pac_stream_type pac( pac_stream );
184 output_stream_type out( stream );
187 assign( seq_length, pac, out );
199 IndexVector& index_vec,
200 IndexVector& name_index_vec,
201 StringVector& name_vec) :
212 m_info.m_max_sequence_len = 0u;
246 const std::string ann = std::string(sequence_file_name) +
".ann";
247 const std::string pac = std::string(sequence_file_name) +
".pac";
248 const std::string wpac = std::string(sequence_file_name) +
".wpac";
249 FILE* ann_file = fopen( ann.c_str(),
"rb" );
250 FILE* pac_file = fopen( pac.c_str(),
"rb" );
251 FILE* wpac_file = fopen( wpac.c_str(),
"rb" );
253 bool ann_ok = ann_file != NULL;
254 bool seq_ok = (pac_file != NULL || wpac_file != NULL);
256 if (ann_file) fclose( ann_file );
257 if (pac_file) fclose( pac_file );
258 if (wpac_file) fclose( wpac_file );
260 return ann_ok && seq_ok;
301 info = loader.m_info;
305 log_error(stderr,
"loading BNS files failed\n");
312 const uint32 seq_length = info.
bps();
314 const uint32 aligned_seq_words = align<4>( seq_words );
317 sequence_data->SequenceDataInfo::operator=( info );
327 for (uint32 i = seq_words; i < aligned_seq_words; ++i)
333 return load_pac<DNA>( prefix, &sequence_data->
m_sequence_vec[0], seq_length, seq_words );
336 return load_pac<DNA_N>( prefix, &sequence_data->
m_sequence_vec[0], seq_length, seq_words );
339 return load_pac<PROTEIN>( prefix, &sequence_data->
m_sequence_vec[0], seq_length, seq_words );
363 const char* mapped_name,
378 sequence_index_vec.resize( 1 );
379 sequence_index_vec[0] = 0;
382 name_index_vec.resize( 1 );
383 name_index_vec[0] = 0;
389 BNTLoader loader( sequence_index_vec, name_index_vec, name_vec );
392 info = loader.m_info;
396 log_error(stderr,
"loading BNS files failed\n");
403 const uint32 seq_length = info.
bps();
405 const uint32 aligned_seq_words = align<4>( seq_words );
417 sequence_name.c_str(),
418 aligned_seq_words *
sizeof(
uint32),
422 for (uint32 i = seq_words; i < aligned_seq_words; ++i)
423 sequence_ptr[i] = 0u;
427 sequence_index_name.c_str(),
428 sequence_index_vec.size() *
sizeof(
uint32),
433 name_index_name.c_str(),
434 name_index_vec.size() *
sizeof(
uint32),
440 name_vec.size() *
sizeof(char),
450 memcpy( sequence_index_ptr, &sequence_index_vec[0], sequence_index_vec.size() *
sizeof(
uint32) );
451 memcpy( name_index_ptr, &name_index_vec[0], name_index_vec.size() *
sizeof(
uint32) );
452 memcpy( name_ptr, &name_vec[0], name_vec.size() *
sizeof(char) );
460 return load_pac<DNA>( prefix, sequence_ptr, seq_length, seq_words );
463 return load_pac<DNA_N>( prefix, sequence_ptr, seq_length, seq_words );
466 return load_pac<RNA>( prefix, sequence_ptr, seq_length, seq_words );
469 return load_pac<RNA_N>( prefix, sequence_ptr, seq_length, seq_words );
472 return load_pac<PROTEIN>( prefix, sequence_ptr, seq_length, seq_words );