NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
bufferedtextfile.h
Go to the documentation of this file.
1 /*
2  * nvbio
3  * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #pragma once
29 
30 #include <nvbio/basic/types.h>
31 #include <nvbio/basic/exceptions.h>
32 
33 #include <zlib/zlib.h>
34 
35 #include <stdio.h>
36 #include <string.h>
37 #include <vector>
38 
39 namespace nvbio {
40 namespace io {
41 
44 
45 // Generic I/O class for consuming data from a text file delimited by a single record separator
46 // handles gzip-compressed files transparently
48 {
// byte that marks the end of a record; fixed at construction
49  const char record_separator;
50 
// zlib handle of the input file, obtained from gzopen() in the constructor
51  gzFile fp;
// set once a read returns no more data; no further reads are attempted afterwards
52  bool eof;
53 
// in-memory window over the file contents; one byte past the valid data is used for a NUL
54  std::vector<char> buffer;
// read_ptr: offset in buffer of the first byte not yet handed out as part of a record
// valid_size: number of bytes at the front of buffer that hold data read from the file
55  size_t read_ptr, valid_size;
56 
57 public:
58  BufferedTextFile(const char *fname, char record_separator = '\n', size_t buffer_size = 256 * 1024)
59  : record_separator(record_separator), eof(false), read_ptr(0), valid_size(0)
60  {
61  fp = gzopen(fname, "r");
62  if (fp == NULL)
63  {
64  throw nvbio::runtime_error("unable to open %s for reading", fname);
65  }
66 
67  buffer.resize(buffer_size + 1);
68  buffer[buffer_size] = 0;
69  };
70 
// NOTE(review): the declaration line for this body is missing from this listing;
// presumably it is the destructor -- confirm against the original header.
// Closes the gzip handle if one is open.
72  {
73  if (fp)
74  {
75  gzclose(fp);
76  }
77  }
78 
79  // refills buffer by reading from file
80  // preserves all unprocessed bytes (i.e., anything after read_ptr is moved to the front of buffer prior to reading)
81  bool fill_buffer(void)
82  {
83  if (eof)
84  {
85  return false;
86  }
87 
88  if (read_ptr != valid_size && read_ptr != 0)
89  {
90  memmove(&buffer[0], &buffer[read_ptr], valid_size - read_ptr);
91  }
92 
93  valid_size -= read_ptr;
94  read_ptr = 0;
95 
96  size_t bytes_read = gzread(fp, &buffer[valid_size], buffer.size() - valid_size - 1);
97  if (bytes_read == 0)
98  {
99  // end of file reached
100  eof = true;
101  return false;
102  }
103 
104  valid_size += bytes_read;
105  NVBIO_CUDA_ASSERT(valid_size < buffer.size() - 1);
106  buffer[valid_size] = 0;
107 
108  return true;
109  }
110 
111  bool buffer_full(void)
112  {
113  return (read_ptr == 0 && valid_size == buffer.size() - 1);
114  }
115 
116  bool buffer_empty(void)
117  {
118  return (valid_size == 0 || read_ptr == valid_size);
119  }
120 
121 public:
122  char *next_record(char **line_end)
123  {
124  char *start, *end;
125 
126  do
127  {
128  // search for the next record separator in memory
129  start = &buffer[read_ptr];
130  end = (char *) memchr(start, record_separator, valid_size - read_ptr);
131 
132  if (end == NULL)
133  {
134  // not found
135  if (eof)
136  {
137  // we're at the end of the file, so just return what's left in memory
138  if (buffer_empty())
139  {
140  return NULL;
141  } else {
142  NVBIO_CUDA_ASSERT(valid_size <= buffer.size());
143  end = &buffer[valid_size];
144  }
145  } else {
146  // need to read more data from the file
147  // first check if the buffer is completely full
148  if (buffer_full())
149  {
150  // resize the buffer
151  buffer.resize(buffer.size() * 2);
152  }
153 
154  // read more data from the file and try again
155  fill_buffer();
156  }
157  }
158  } while(end == NULL);
159 
160  if (line_end)
161  *line_end = end;
162 
163  read_ptr += end - start + 1;
164  return start;
165  }
166 };
167 
168 } // namespace io
169 } // namespace nvbio