NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
sequence_priv.cpp
Go to the documentation of this file.
1 /*
2  * nvbio
3  * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
30 
37 
39 
40 namespace nvbio {
41 namespace io {
42 
43 // grab the next batch of reads into a host memory buffer
44 int SequenceDataFile::next(SequenceDataEncoder* encoder, const uint32 batch_size, const uint32 batch_bps)
45 {
46  const uint32 reads_to_load = std::min(m_options.max_seqs - m_loaded, batch_size);
47 
48  if (!is_ok() || reads_to_load == 0)
49  return 0;
50 
51  // a default average read length used to reserve enough space
52  const uint32 AVG_READ_LENGTH = 100;
53 
54  encoder->begin_batch();
55  encoder->reserve(
56  batch_size,
57  batch_bps == uint32(-1) ? batch_size * AVG_READ_LENGTH : batch_bps ); // try to use a default read length
58 
59  // fetch the sequence info
60  const SequenceDataInfo* info = encoder->info();
61 
62  while (info->size() < reads_to_load &&
63  info->bps() < batch_bps)
64  {
65  // load 100 at a time if possible
66  const uint32 chunk_reads = nvbio::min(reads_to_load - info->size(), uint32(100));
67  const uint32 chunk_bps = batch_bps - info->bps();
68 
69  const int n = nextChunk( encoder , chunk_reads, chunk_bps );
70  assert(n <= (int) chunk_reads);
71  if (n == 0)
72  break;
73 
74  assert(info->size() <= reads_to_load);
75  }
76 
77  m_loaded += info->size();
78 
79  encoder->end_batch();
80 
81  return info->size();
82 }
83 
84 // factory method to open a read file, tries to detect file type based on file name
86  const char * sequence_file_name,
87  const QualityEncoding qualities,
88  const uint32 max_seqs,
89  const uint32 max_sequence_len,
90  const SequenceEncoding flags,
91  const uint32 trim3,
92  const uint32 trim5)
93 {
94  // parse out file extension; look for .fastq.gz, .fastq suffixes
95  uint32 len = uint32( strlen(sequence_file_name) );
96  bool is_gzipped = false;
97 
99  options.qualities = qualities;
100  options.max_seqs = max_seqs;
101  options.max_sequence_len = max_sequence_len;
102  options.flags = flags;
103  options.trim3 = trim3;
104  options.trim5 = trim5;
105 
106  // do we have a .gz suffix?
107  if (len >= strlen(".gz"))
108  {
109  if (strcmp(&sequence_file_name[len - strlen(".gz")], ".gz") == 0)
110  {
111  is_gzipped = true;
112  len = uint32(len - strlen(".gz"));
113  }
114  }
115 
116  // check for fasta suffix
117  if (len >= strlen(".fasta"))
118  {
119  if (strncmp(&sequence_file_name[len - strlen(".fasta")], ".fasta", strlen(".fasta")) == 0)
120  {
121  return new SequenceDataFile_FASTA_gz(
122  sequence_file_name,
123  options );
124  }
125  }
126  // check for fastq suffix
127  if (len >= strlen(".fa"))
128  {
129  if (strncmp(&sequence_file_name[len - strlen(".fa")], ".fa", strlen(".fa")) == 0)
130  {
131  return new SequenceDataFile_FASTA_gz(
132  sequence_file_name,
133  options );
134  }
135  }
136 
137  // check for fastq suffix
138  if (len >= strlen(".fastq"))
139  {
140  if (strncmp(&sequence_file_name[len - strlen(".fastq")], ".fastq", strlen(".fastq")) == 0)
141  {
142  return new SequenceDataFile_FASTQ_gz(
143  sequence_file_name,
144  options );
145  }
146  }
147 
148  // check for fastq suffix
149  if (len >= strlen(".fq"))
150  {
151  if (strncmp(&sequence_file_name[len - strlen(".fq")], ".fq", strlen(".fq")) == 0)
152  {
153  return new SequenceDataFile_FASTQ_gz(
154  sequence_file_name,
155  options );
156  }
157  }
158 
159  // check for txt suffix
160  if (len >= strlen(".txt"))
161  {
162  if (strncmp(&sequence_file_name[len - strlen(".txt")], ".txt", strlen(".txt")) == 0)
163  {
164  return new SequenceDataFile_TXT_gz(
165  sequence_file_name,
166  options );
167  }
168  }
169 
170  // check for sam suffix
171  if (len >= strlen(".sam"))
172  {
173  if (strncmp(&sequence_file_name[len - strlen(".sam")], ".sam", strlen(".sam")) == 0)
174  {
176 
177  ret = new SequenceDataFile_SAM(
178  sequence_file_name,
179  options );
180 
181  if (ret->init() == false)
182  {
183  delete ret;
184  return NULL;
185  }
186 
187  return ret;
188  }
189  }
190 
191  // check for bam suffix
192  if (len >= strlen(".bam"))
193  {
194  if (strncmp(&sequence_file_name[len - strlen(".bam")], ".bam", strlen(".bam")) == 0)
195  {
197 
198  ret = new SequenceDataFile_BAM(
199  sequence_file_name,
200  options );
201 
202  if (ret->init() == false)
203  {
204  delete ret;
205  return NULL;
206  }
207 
208  return ret;
209  }
210  }
211 
212  // we don't actually know what this is; guess fastq
213  log_warning(stderr, "could not determine file type for %s; guessing %sfastq\n", sequence_file_name, is_gzipped ? "compressed " : "");
214  return new SequenceDataFile_FASTQ_gz(
215  sequence_file_name,
216  options );
217 }
218 
219 // load a sequence file
220 //
221 // \param alphabet the alphabet used to encode the sequence data
222 // \param sequence_data the output sequence data
223 // \param sequence_file_name the file to open
224 // \param load_flags a set of flags indicating what to load
225 // \param qualities the encoding of the qualities
226 //
228  const Alphabet alphabet,
229  SequenceDataHost* sequence_data,
230  const char* sequence_file_name,
231  const SequenceFlags load_flags,
232  const QualityEncoding qualities)
233 {
234  // check whether this is a pac archive
235  if (is_pac_archive( sequence_file_name ))
236  return load_pac( alphabet, sequence_data, sequence_file_name, load_flags, qualities );
237 
238  // open a regular stream
239  SharedPointer<SequenceDataStream> sequence_file( open_sequence_file( sequence_file_name, qualities ) );
240  if (sequence_file == NULL || sequence_file->is_ok() == false)
241  return false;
242 
243  // load as many sequences as possible in one go
244  return io::next( alphabet, sequence_data, sequence_file.get(), uint32(-1), uint32(-1) ) > 0;
245 }
246 
247 
256  const Alphabet alphabet,
257  const char* sequence_file_name,
258  const SequenceFlags load_flags,
259  const QualityEncoding qualities)
260 {
262  if (load_sequence_file( alphabet, ret, sequence_file_name, load_flags, qualities ) == false)
263  {
264  delete ret;
265  return NULL;
266  }
267  return ret;
268 }
269 
270 //\relates SequenceDataOutputStream
271 // factory method to open a read file for writing
272 //
273 // \param sequence_file_name the file to open
274 // \param compression compression options
275 //
277  const char* sequence_file_name,
278  const char* options)
279 {
280  // parse out file extension; look for .fastq.gz, .fastq suffixes
281  uint32 len = uint32( strlen(sequence_file_name) );
282  const char* gz = "gz";
283  const char* lz4 = "lz4";
284  const char* compressor = NULL;
285 
286  // do we have a .gz suffix?
287  if (len >= strlen(".gz"))
288  {
289  if (strcmp(&sequence_file_name[len - strlen(".gz")], ".gz") == 0)
290  {
291  compressor = gz;
292  len = uint32(len - strlen(".gz"));
293  }
294  }
295 
296  // do we have a .gz suffix?
297  if (len >= strlen(".lz4"))
298  {
299  if (strcmp(&sequence_file_name[len - strlen(".lz4")], ".lz4") == 0)
300  {
301  compressor = lz4;
302  len = uint32(len - strlen(".lz4"));
303  }
304  }
305 
306  // check for fastq suffix
307  if (len >= strlen(".fastq"))
308  {
309  if (strncmp(&sequence_file_name[len - strlen(".fastq")], ".fastq", strlen(".fastq")) == 0)
310  {
311  return new SequenceDataOutputFile_FASTQ(
312  sequence_file_name,
313  compressor,
314  options );
315  }
316  }
317 
318  // check for fastq suffix
319  if (len >= strlen(".fq"))
320  {
321  if (strncmp(&sequence_file_name[len - strlen(".fq")], ".fq", strlen(".fq")) == 0)
322  {
323  return new SequenceDataOutputFile_FASTQ(
324  sequence_file_name,
325  compressor,
326  options );
327  }
328  }
329 
330  // check for fastq suffix
331  if (len >= strlen(".fasta"))
332  {
333  if (strncmp(&sequence_file_name[len - strlen(".fasta")], ".fasta", strlen(".fasta")) == 0)
334  {
335  return new SequenceDataOutputFile_FASTA(
336  sequence_file_name,
337  compressor,
338  options );
339  }
340  }
341 
342  // check for fastq suffix
343  if (len >= strlen(".fa"))
344  {
345  if (strncmp(&sequence_file_name[len - strlen(".fa")], ".fa", strlen(".fa")) == 0)
346  {
347  return new SequenceDataOutputFile_FASTA(
348  sequence_file_name,
349  compressor,
350  options );
351  }
352  }
353 
354  // check for fastq suffix
355  if (len >= strlen(".txt"))
356  {
357  if (strncmp(&sequence_file_name[len - strlen(".txt")], ".txt", strlen(".txt")) == 0)
358  {
359  return new SequenceDataOutputFile_TXT(
360  sequence_file_name,
361  compressor,
362  options );
363  }
364  }
365 
366  // we don't actually know what this is; guess fastq
367  log_warning(stderr, "could not determine file type for %s; guessing fastq\n", sequence_file_name);
368  return new SequenceDataOutputFile_FASTQ(
369  sequence_file_name,
370  compressor,
371  options );
372 }
373 
374 } // namespace io
375 } // namespace nvbio