NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
kseq.h
Go to the documentation of this file.
1 /* The MIT License
2 
3  Copyright (c) 2008, 2009, 2011 Attractive Chaos <attractor@live.co.uk>
4 
5  Permission is hereby granted, free of charge, to any person obtaining
6  a copy of this software and associated documentation files (the
7  "Software"), to deal in the Software without restriction, including
8  without limitation the rights to use, copy, modify, merge, publish,
9  distribute, sublicense, and/or sell copies of the Software, and to
10  permit persons to whom the Software is furnished to do so, subject to
11  the following conditions:
12 
13  The above copyright notice and this permission notice shall be
14  included in all copies or substantial portions of the Software.
15 
16  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20  BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21  ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22  CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23  SOFTWARE.
24 */
25 
26 /* Last Modified: 05MAR2012 */
27 
28 #ifndef AC_KSEQ_H
29 #define AC_KSEQ_H
30 
#include <ctype.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
34 
/* Delimiter classes understood by ks_getuntil2(). A delimiter value greater
 * than KS_SEP_MAX is compared against each input byte literally. */
#define KS_SEP_SPACE 0 /* any isspace() byte: ' ', \t, \n, \v, \f, \r */
#define KS_SEP_TAB   1 /* any isspace() byte except ' ' */
#define KS_SEP_LINE  2 /* end of line: "\n" (Unix) or "\r\n" (Windows) */
#define KS_SEP_MAX   2 /* largest reserved delimiter class value */
39 
/* Declares kstream_t, a buffered reader over a handle of type `type_t`.
 *   begin, end : unread bytes occupy buf[begin] .. buf[end-1]
 *   is_eof     : set once a refill returns fewer bytes than bufsize
 *   bufsize    : number of bytes requested per refill
 *   seek_pos   : running count of bytes consumed by the ks_* readers
 *   f          : handle passed to the user-supplied read function
 *   buf        : the read buffer
 */
#define __KS_TYPE(type_t) \
	typedef struct __kstream_t { \
		int begin; \
		int end; \
		int is_eof:2; \
		int bufsize:30; \
		uint64_t seek_pos; \
		type_t f; \
		unsigned char *buf; \
	} kstream_t;
48 
/* Nonzero once the EOF flag is set and every buffered byte has been consumed. */
#define ks_eof(ks) \
	((ks)->is_eof && (ks)->begin >= (ks)->end)

/* Clears the EOF flag and discards buffered data; evaluates to 0.
 * The handle `f` and `seek_pos` are left untouched. */
#define ks_rewind(ks) \
	((ks)->is_eof = ((ks)->begin = ((ks)->end = 0)))
51 
/* Generates the kstream_t constructor and destructor.
 *   SCOPE     : storage-class prefix put in front of both functions
 *   type_t    : type of the underlying handle
 *   __bufsize : size in bytes of the read buffer
 *
 * ks_init(f)     : returns a zero-initialised stream wrapping `f` together
 *                  with its buffer, or NULL if either allocation fails.
 * ks_destroy(ks) : frees the buffer and the stream; NULL is accepted.
 *                  The handle `f` itself is not closed.
 *
 * The casts on the allocation results are kept so that the header also
 * compiles when included from C++.
 */
#define __KS_BASIC(SCOPE, type_t, __bufsize) \
	SCOPE kstream_t *ks_init(type_t f) \
	{ \
		kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \
		if (ks == NULL) return NULL; \
		ks->buf = (unsigned char*)malloc(__bufsize); \
		if (ks->buf == NULL) { /* never hand out a stream without a buffer */ \
			free(ks); \
			return NULL; \
		} \
		ks->f = f; \
		ks->bufsize = __bufsize; \
		return ks; \
	} \
	SCOPE void ks_destroy(kstream_t *ks) \
	{ \
		if (ks == NULL) return; \
		free(ks->buf); \
		free(ks); \
	}
66 
/* Generates the two inline readers; `__read(f, buf, n)` is the user-supplied
 * refill function.
 *
 * ks_getc(ks)
 *     Returns the next byte as a value in 0..255 and advances seek_pos by one,
 *     or returns -1 when no more data is available. A refill that returns a
 *     negative value (read error) is treated like end of data, so the caller
 *     never receives a byte from a buffer that was not filled.
 *
 * ks_getuntil(ks, delimiter, str, dret)
 *     ks_getuntil2() with append == 0, i.e. `str` is overwritten.
 */
#define __KS_INLINED(__read) \
	static inline int ks_getc(kstream_t *ks) \
	{ \
		if (ks->begin >= ks->end) { /* buffer drained: stop or refill */ \
			if (ks->is_eof) return -1; \
			ks->begin = 0; \
			ks->end = __read(ks->f, ks->buf, ks->bufsize); \
			if (ks->end < ks->bufsize) ks->is_eof = 1; \
			if (ks->end <= 0) { /* end of data, or a failed read */ \
				ks->end = 0; \
				return -1; \
			} \
		} \
		ks->seek_pos++; \
		return (int)ks->buf[ks->begin++]; \
	} \
	static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
	{ \
		return ks_getuntil2(ks, delimiter, str, dret, 0); \
	}
82 
#ifndef KSTRING_T
#define KSTRING_T kstring_t
/* Growable character buffer: `l` bytes in use, `m` bytes allocated at `s`. */
typedef struct __kstring_t {
	size_t l;
	size_t m;
	char *s;
} kstring_t;
#endif
90 
#ifndef kroundup32
/* Rounds the lvalue `x` up, in place, to a power of two and evaluates to the
 * new value; a value that already is a power of two is unchanged. The bit
 * smearing covers 32 bits. `x` is evaluated several times, so it must be free
 * of side effects. */
#define kroundup32(x) \
	(--(x), \
	 (x) |= (x) >> 1, (x) |= (x) >> 2, (x) |= (x) >> 4, \
	 (x) |= (x) >> 8, (x) |= (x) >> 16, \
	 ++(x))
#endif
94 
/* Generates ks_getuntil2(ks, delimiter, str, dret, append).
 *
 * Copies bytes from the stream into `str` up to, but not including, the next
 * delimiter, then NUL-terminates `str`.
 *   delimiter : KS_SEP_SPACE, KS_SEP_TAB, KS_SEP_LINE, or a literal byte
 *               value greater than KS_SEP_MAX
 *   str       : destination; grown with realloc() as needed
 *   dret      : if non-NULL, receives the delimiter byte that ended the read,
 *               or 0 when the data ran out first
 *   append    : nonzero keeps the current contents of `str` and appends
 * Returns the resulting length of `str`, or -1 if the stream was already
 * exhausted on entry. With KS_SEP_LINE a trailing '\r' is dropped.
 *
 * ks->seek_pos grows by exactly the number of bytes this call consumed (the
 * copied text plus the delimiter, if one was hit), whether or not `append`
 * is set. A refill that returns a negative value is treated like end of data.
 */
#define __KS_GETUNTIL(SCOPE, __read) \
	SCOPE int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \
	{ \
		uint64_t consumed = 0; /* bytes taken from the stream by this call only */ \
		if (dret) *dret = 0; \
		if (!append) str->l = 0; \
		if (ks->begin >= ks->end && ks->is_eof) return -1; \
		for (;;) { \
			int i; \
			if (ks->begin >= ks->end) { \
				if (ks->is_eof) break; \
				ks->begin = 0; \
				ks->end = __read(ks->f, ks->buf, ks->bufsize); \
				if (ks->end < ks->bufsize) ks->is_eof = 1; \
				if (ks->end <= 0) { /* end of data, or a failed read */ \
					ks->end = 0; \
					break; \
				} \
			} \
			if (delimiter == KS_SEP_LINE) { \
				for (i = ks->begin; i < ks->end; ++i) \
					if (ks->buf[i] == '\n') break; \
			} else if (delimiter > KS_SEP_MAX) { \
				for (i = ks->begin; i < ks->end; ++i) \
					if (ks->buf[i] == delimiter) break; \
			} else if (delimiter == KS_SEP_SPACE) { \
				for (i = ks->begin; i < ks->end; ++i) \
					if (isspace(ks->buf[i])) break; \
			} else if (delimiter == KS_SEP_TAB) { \
				for (i = ks->begin; i < ks->end; ++i) \
					if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \
			} else i = 0; /* never come to here! */ \
			if (str->m - str->l < (size_t)(i - ks->begin + 1)) { /* room for the chunk plus a NUL */ \
				str->m = str->l + (i - ks->begin) + 1; \
				kroundup32(str->m); \
				str->s = (char*)realloc(str->s, str->m); \
			} \
			memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
			str->l += i - ks->begin; \
			consumed += i - ks->begin; \
			ks->begin = i + 1; \
			if (i < ks->end) { /* stopped on a delimiter, which is consumed too */ \
				++consumed; \
				if (dret) *dret = ks->buf[i]; \
				break; \
			} \
		} \
		ks->seek_pos += consumed; \
		if (str->s == 0) { \
			str->m = 1; \
			str->s = (char*)calloc(1, 1); \
		} else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') { \
			--str->l; \
		} else if (str->l >= str->m) { /* appended to a full buffer but read nothing: make room for the NUL */ \
			str->m = str->l + 1; \
			kroundup32(str->m); \
			str->s = (char*)realloc(str->s, str->m); \
		} \
		str->s[str->l] = '\0'; \
		return (int)str->l; \
	}
147 
/* Emits the whole kstream_t implementation in one go: the type, ks_init(),
 * ks_destroy(), ks_getuntil2() and the inline readers.
 *   SCOPE     : storage-class prefix for the non-inline functions
 *   type_t    : type of the underlying handle
 *   __read    : refill function, called as __read(f, buf, bufsize)
 *   __bufsize : read-buffer size in bytes
 */
#define KSTREAM_INIT2(SCOPE, type_t, __read, __bufsize) \
	__KS_TYPE(type_t) \
	__KS_BASIC(SCOPE, type_t, __bufsize) \
	__KS_GETUNTIL(SCOPE, __read) \
	__KS_INLINED(__read)

/* KSTREAM_INIT2 with the non-inline functions declared static. */
#define KSTREAM_INIT(type_t, __read, __bufsize) \
	KSTREAM_INIT2(static, type_t, __read, __bufsize)
155 
/* Declaration-only counterpart of KSTREAM_INIT2: emits the kstream_t type,
 * prototypes for the externally defined ks_init(), ks_destroy() and
 * ks_getuntil2(), and the inline readers built on `__read`. */
#define KSTREAM_DECLARE(type_t, __read) \
	__KS_TYPE(type_t) \
	extern kstream_t *ks_init(type_t f); \
	extern void ks_destroy(kstream_t *ks); \
	extern int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append); \
	__KS_INLINED(__read)
162 
163 /******************
164  * FASTA/Q parser *
165  ******************/
166 
/* Resets a kseq_t reader: clears the remembered header character and, on the
 * embedded stream, the EOF flag and the buffered data. Evaluates to 0. */
#define kseq_rewind(ks) \
	((ks)->last_char = ((ks)->f->is_eof = ((ks)->f->begin = ((ks)->f->end = 0))))
168 
/* Generates the kseq_t constructor and destructor.
 *   SCOPE  : storage-class prefix put in front of both functions
 *   type_t : type of the underlying handle
 *
 * kseq_init(fd)    : returns a zero-initialised reader whose stream wraps
 *                    `fd`, or NULL if the reader or its stream cannot be
 *                    allocated.
 * kseq_destroy(ks) : frees the four string buffers, the stream and the
 *                    reader; NULL is accepted. `fd` itself is not closed.
 */
#define __KSEQ_BASIC(SCOPE, type_t) \
	SCOPE kseq_t *kseq_init(type_t fd) \
	{ \
		kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \
		if (s == NULL) return NULL; \
		s->f = ks_init(fd); \
		if (s->f == NULL) { /* a reader without a stream is unusable */ \
			free(s); \
			return NULL; \
		} \
		return s; \
	} \
	SCOPE void kseq_destroy(kseq_t *ks) \
	{ \
		if (ks == NULL) return; \
		free(ks->name.s); \
		free(ks->comment.s); \
		free(ks->seq.s); \
		free(ks->qual.s); \
		ks_destroy(ks->f); \
		free(ks); \
	}
183 
/* Generates kseq_read(seq), which parses the next FASTA or FASTQ record into
 * seq->name, seq->comment, seq->seq and seq->qual.
 *
 * Return value:
 *   >=0  length of the sequence (normal)
 *   -1   end-of-file
 *   -2   truncated quality string
 *
 * When a record has no comment or no quality string, the corresponding
 * buffer, if already allocated, is left holding an empty string, so text
 * from the previous record is never visible through `.s`.
 */
#define __KSEQ_READ(SCOPE) \
	SCOPE int kseq_read(kseq_t *seq) \
	{ \
		int c; \
		kstream_t *ks = seq->f; \
		if (seq->last_char == 0) { /* then jump to the next header line */ \
			while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
			if (c == -1) return -1; /* end of file */ \
			seq->last_char = c; \
		} /* else: the first header char has been read in the previous call */ \
		seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \
		if (seq->comment.s) seq->comment.s[0] = 0; /* a record without a comment must not show the previous one */ \
		if (seq->qual.s) seq->qual.s[0] = 0; /* likewise for a FASTA record read after a FASTQ one */ \
		if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \
		if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \
		if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \
			seq->seq.m = 256; \
			seq->seq.s = (char*)malloc(seq->seq.m); \
		} \
		while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
			if (c == '\n') continue; /* skip empty lines */ \
			seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \
			ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \
		} \
		if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \
		if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \
			seq->seq.m = seq->seq.l + 2; \
			kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \
			seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
		} \
		seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \
		if (c != '+') return (int)seq->seq.l; /* FASTA */ \
		if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \
			seq->qual.m = seq->seq.m; \
			seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \
		} \
		while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
		if (c == -1) return -2; /* error: no quality string */ \
		while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \
		seq->last_char = 0; /* we have not come to the next header line */ \
		if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \
		return (int)seq->seq.l; \
	}
230 
/* Declares kseq_t, the FASTA/FASTQ reader state.
 *   name, comment, seq, qual : fields of the most recently read record
 *   last_char                : header character ('>' or '@') already consumed
 *                              for the next record, or 0 if none
 *   f                        : underlying buffered stream
 * The `type_t` argument is accepted but not used by the declaration.
 */
#define __KSEQ_TYPE(type_t) \
	typedef struct { \
		kstring_t name; \
		kstring_t comment; \
		kstring_t seq; \
		kstring_t qual; \
		int last_char; \
		kstream_t *f; \
	} kseq_t;
237 
/* Emits the complete FASTA/FASTQ reader: a static kstream_t implementation
 * with a 16384-byte buffer, the kseq_t type, and kseq_init(), kseq_destroy()
 * and kseq_read() with storage class SCOPE.
 *   type_t : type of the underlying handle
 *   __read : refill function, called as __read(f, buf, bufsize)
 */
#define KSEQ_INIT2(SCOPE, type_t, __read) \
	KSTREAM_INIT(type_t, __read, 16384) \
	__KSEQ_TYPE(type_t) \
	__KSEQ_BASIC(SCOPE, type_t) \
	__KSEQ_READ(SCOPE)

/* KSEQ_INIT2 with the kseq_* functions declared static. */
#define KSEQ_INIT(type_t, __read) \
	KSEQ_INIT2(static, type_t, __read)
245 
/* Declaration-only counterpart of KSEQ_INIT2: emits the kstream_t and kseq_t
 * types plus prototypes for kseq_init(), kseq_destroy() and kseq_read(),
 * which are expected to be defined in another translation unit. */
#define KSEQ_DECLARE(type_t) \
	__KS_TYPE(type_t) \
	__KSEQ_TYPE(type_t) \
	extern kseq_t *kseq_init(type_t fd); \
	extern void kseq_destroy(kseq_t *ks); \
	extern int kseq_read(kseq_t *seq);
252 
253 #endif