NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
faidx.c
Go to the documentation of this file.
1 #include "config.h"
2 
3 #include <ctype.h>
4 #include <string.h>
5 #include <stdlib.h>
6 #include <stdio.h>
7 #include <stdint.h>
8 
9 #include "htslib/bgzf.h"
10 #include "htslib/faidx.h"
11 #include "htslib/khash.h"
12 #ifdef _USE_KNETFILE
13 #include "htslib/knetfile.h"
14 #endif
15 
16 typedef struct {
17  int32_t line_len, line_blen;
20 } faidx1_t;
22 
23 struct __faidx_t {
25  int n, m;
26  char **name;
27  khash_t(s) *hash;
28 };
29 
30 #ifndef kroundup32
31 #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
32 #endif
33 
34 static inline void fai_insert_index(faidx_t *idx, const char *name, int len, int line_len, int line_blen, uint64_t offset)
35 {
36  khint_t k;
37  int ret;
38  faidx1_t t;
39  if (idx->n == idx->m) {
40  idx->m = idx->m? idx->m<<1 : 16;
41  idx->name = (char**)realloc(idx->name, sizeof(char*) * idx->m);
42  }
43  idx->name[idx->n] = strdup(name);
44  k = kh_put(s, idx->hash, idx->name[idx->n], &ret);
45  t.len = len; t.line_len = line_len; t.line_blen = line_blen; t.offset = offset;
46  kh_value(idx->hash, k) = t;
47  ++idx->n;
48 }
49 
51 {
52  char c, *name;
53  int l_name, m_name;
54  int line_len, line_blen, state;
55  int l1, l2;
56  faidx_t *idx;
57  uint64_t offset;
58  int64_t len;
59 
60  idx = (faidx_t*)calloc(1, sizeof(faidx_t));
61  idx->hash = kh_init(s);
62  name = 0; l_name = m_name = 0;
63  len = line_len = line_blen = -1; state = 0; l1 = l2 = -1; offset = 0;
64  while ( (c=bgzf_getc(bgzf))>=0 ) {
65  if (c == '\n') { // an empty line
66  if (state == 1) {
67  offset = bgzf_utell(bgzf);
68  continue;
69  } else if ((state == 0 && len < 0) || state == 2) continue;
70  }
71  if (c == '>') { // fasta header
72  if (len >= 0)
73  fai_insert_index(idx, name, len, line_len, line_blen, offset);
74  l_name = 0;
75  while ( (c=bgzf_getc(bgzf))>=0 && !isspace(c)) {
76  if (m_name < l_name + 2) {
77  m_name = l_name + 2;
78  kroundup32(m_name);
79  name = (char*)realloc(name, m_name);
80  }
81  name[l_name++] = c;
82  }
83  name[l_name] = '\0';
84  if ( c<0 ) {
85  fprintf(stderr, "[fai_build_core] the last entry has no sequence\n");
86  free(name); fai_destroy(idx);
87  return 0;
88  }
89  if (c != '\n') while ( (c=bgzf_getc(bgzf))>=0 && c != '\n');
90  state = 1; len = 0;
91  offset = bgzf_utell(bgzf);
92  } else {
93  if (state == 3) {
94  fprintf(stderr, "[fai_build_core] inlined empty line is not allowed in sequence '%s'.\n", name);
95  free(name); fai_destroy(idx);
96  return 0;
97  }
98  if (state == 2) state = 3;
99  l1 = l2 = 0;
100  do {
101  ++l1;
102  if (isgraph(c)) ++l2;
103  } while ( (c=bgzf_getc(bgzf))>=0 && c != '\n');
104  if (state == 3 && l2) {
105  fprintf(stderr, "[fai_build_core] different line length in sequence '%s'.\n", name);
106  free(name); fai_destroy(idx);
107  return 0;
108  }
109  ++l1; len += l2;
110  if (state == 1) line_len = l1, line_blen = l2, state = 0;
111  else if (state == 0) {
112  if (l1 != line_len || l2 != line_blen) state = 2;
113  }
114  }
115  }
116  fai_insert_index(idx, name, len, line_len, line_blen, offset);
117  free(name);
118  return idx;
119 }
120 
121 void fai_save(const faidx_t *fai, FILE *fp)
122 {
123  khint_t k;
124  int i;
125  for (i = 0; i < fai->n; ++i) {
126  faidx1_t x;
127  k = kh_get(s, fai->hash, fai->name[i]);
128  x = kh_value(fai->hash, k);
129 #ifdef _WIN32
130  fprintf(fp, "%s\t%d\t%ld\t%d\t%d\n", fai->name[i], (int)x.len, (long)x.offset, (int)x.line_blen, (int)x.line_len);
131 #else
132  fprintf(fp, "%s\t%d\t%lld\t%d\t%d\n", fai->name[i], (int)x.len, (long long)x.offset, (int)x.line_blen, (int)x.line_len);
133 #endif
134  }
135 }
136 
137 faidx_t *fai_read(FILE *fp)
138 {
139  faidx_t *fai;
140  char *buf, *p;
141  int len, line_len, line_blen;
142 #ifdef _WIN32
143  long offset;
144 #else
145  long long offset;
146 #endif
147  fai = (faidx_t*)calloc(1, sizeof(faidx_t));
148  fai->hash = kh_init(s);
149  buf = (char*)calloc(0x10000, 1);
150  while (!feof(fp) && fgets(buf, 0x10000, fp)) {
151  for (p = buf; *p && isgraph(*p); ++p);
152  *p = 0; ++p;
153 #ifdef _WIN32
154  sscanf(p, "%d%ld%d%d", &len, &offset, &line_blen, &line_len);
155 #else
156  sscanf(p, "%d%lld%d%d", &len, &offset, &line_blen, &line_len);
157 #endif
158  fai_insert_index(fai, buf, len, line_len, line_blen, offset);
159  }
160  free(buf);
161  return fai;
162 }
163 
165 {
166  int i;
167  for (i = 0; i < fai->n; ++i) free(fai->name[i]);
168  free(fai->name);
169  kh_destroy(s, fai->hash);
170  if (fai->bgzf) bgzf_close(fai->bgzf);
171  free(fai);
172 }
173 
174 int fai_build(const char *fn)
175 {
176  char *str;
177  BGZF *bgzf;
178  FILE *fp;
179  faidx_t *fai;
180  str = (char*)calloc(strlen(fn) + 5, 1);
181  sprintf(str, "%s.fai", fn);
182  bgzf = bgzf_open(fn, "r");
183  if ( !bgzf ) {
184  fprintf(stderr, "[fai_build] fail to open the FASTA file %s\n",fn);
185  free(str);
186  return -1;
187  }
188  if ( bgzf->is_compressed ) bgzf_index_build_init(bgzf);
189  fai = fai_build_core(bgzf);
190  if ( bgzf->is_compressed ) bgzf_index_dump(bgzf, fn, ".gzi");
191  bgzf_close(bgzf);
192  fp = fopen(str, "wb");
193  if ( !fp ) {
194  fprintf(stderr, "[fai_build] fail to write FASTA index %s\n",str);
195  fai_destroy(fai); free(str);
196  return -1;
197  }
198  fai_save(fai, fp);
199  fclose(fp);
200  free(str);
201  fai_destroy(fai);
202  return 0;
203 }
204 
205 #ifdef _USE_KNETFILE
206 FILE *download_and_open(const char *fn)
207 {
208  const int buf_size = 1 * 1024 * 1024;
209  uint8_t *buf;
210  FILE *fp;
211  knetFile *fp_remote;
212  const char *url = fn;
213  const char *p;
214  int l = strlen(fn);
215  for (p = fn + l - 1; p >= fn; --p)
216  if (*p == '/') break;
217  fn = p + 1;
218 
219  // First try to open a local copy
220  fp = fopen(fn, "r");
221  if (fp)
222  return fp;
223 
224  // If failed, download from remote and open
225  fp_remote = knet_open(url, "rb");
226  if (fp_remote == 0) {
227  fprintf(stderr, "[download_from_remote] fail to open remote file %s\n",url);
228  return NULL;
229  }
230  if ((fp = fopen(fn, "wb")) == 0) {
231  fprintf(stderr, "[download_from_remote] fail to create file in the working directory %s\n",fn);
232  knet_close(fp_remote);
233  return NULL;
234  }
235  buf = (uint8_t*)calloc(buf_size, 1);
236  while ((l = knet_read(fp_remote, buf, buf_size)) != 0)
237  fwrite(buf, 1, l, fp);
238  free(buf);
239  fclose(fp);
240  knet_close(fp_remote);
241 
242  return fopen(fn, "r");
243 }
244 #endif
245 
246 faidx_t *fai_load(const char *fn)
247 {
248  char *str;
249  FILE *fp;
250  faidx_t *fai;
251  str = (char*)calloc(strlen(fn) + 5, 1);
252  sprintf(str, "%s.fai", fn);
253 
254 #ifdef _USE_KNETFILE
255  if (strstr(fn, "ftp://") == fn || strstr(fn, "http://") == fn)
256  {
257  fp = download_and_open(str);
258  if ( !fp )
259  {
260  fprintf(stderr, "[fai_load] failed to open remote FASTA index %s\n", str);
261  free(str);
262  return 0;
263  }
264  }
265  else
266 #endif
267  fp = fopen(str, "rb");
268  if (fp == 0) {
269  fprintf(stderr, "[fai_load] build FASTA index.\n");
270  fai_build(fn);
271  fp = fopen(str, "rb");
272  if (fp == 0) {
273  fprintf(stderr, "[fai_load] fail to open FASTA index.\n");
274  free(str);
275  return 0;
276  }
277  }
278 
279  fai = fai_read(fp);
280  fclose(fp);
281 
282  fai->bgzf = bgzf_open(fn, "rb");
283  free(str);
284  if (fai->bgzf == 0) {
285  fprintf(stderr, "[fai_load] fail to open FASTA file.\n");
286  return 0;
287  }
288  if ( fai->bgzf->is_compressed==1 )
289  {
290  if ( bgzf_index_load(fai->bgzf, fn, ".gzi") < 0 )
291  {
292  fprintf(stderr, "[fai_load] failed to load .gzi index: %s[.gzi]\n", fn);
293  fai_destroy(fai);
294  return NULL;
295  }
296  }
297  return fai;
298 }
299 
300 char *fai_fetch(const faidx_t *fai, const char *str, int *len)
301 {
302  char *s, c;
303  int i, l, k, name_end;
304  khiter_t iter;
305  faidx1_t val;
306  khash_t(s) *h;
307  int beg, end;
308 
309  beg = end = -1;
310  h = fai->hash;
311  name_end = l = strlen(str);
312  s = (char*)malloc(l+1);
313  // remove space
314  for (i = k = 0; i < l; ++i)
315  if (!isspace(str[i])) s[k++] = str[i];
316  s[k] = 0; l = k;
317  // determine the sequence name
318  for (i = l - 1; i >= 0; --i) if (s[i] == ':') break; // look for colon from the end
319  if (i >= 0) name_end = i;
320  if (name_end < l) { // check if this is really the end
321  int n_hyphen = 0;
322  for (i = name_end + 1; i < l; ++i) {
323  if (s[i] == '-') ++n_hyphen;
324  else if (!isdigit(s[i]) && s[i] != ',') break;
325  }
326  if (i < l || n_hyphen > 1) name_end = l; // malformated region string; then take str as the name
327  s[name_end] = 0;
328  iter = kh_get(s, h, s);
329  if (iter == kh_end(h)) { // cannot find the sequence name
330  iter = kh_get(s, h, str); // try str as the name
331  if (iter == kh_end(h)) {
332  *len = 0;
333  free(s); return 0;
334  } else s[name_end] = ':', name_end = l;
335  }
336  } else iter = kh_get(s, h, str);
337  if(iter == kh_end(h)) {
338  fprintf(stderr, "[fai_fetch] Warning - Reference %s not found in FASTA file, returning empty sequence\n", str);
339  free(s);
340  *len = -2;
341  return 0;
342  };
343  val = kh_value(h, iter);
344  // parse the interval
345  if (name_end < l) {
346  for (i = k = name_end + 1; i < l; ++i)
347  if (s[i] != ',') s[k++] = s[i];
348  s[k] = 0;
349  beg = atoi(s + name_end + 1);
350  for (i = name_end + 1; i != k; ++i) if (s[i] == '-') break;
351  end = i < k? atoi(s + i + 1) : val.len;
352  if (beg > 0) --beg;
353  } else beg = 0, end = val.len;
354  if (beg >= val.len) beg = val.len;
355  if (end >= val.len) end = val.len;
356  if (beg > end) beg = end;
357  free(s);
358 
359  // now retrieve the sequence
360  int ret = bgzf_useek(fai->bgzf, val.offset + beg / val.line_blen * val.line_len + beg % val.line_blen, SEEK_SET);
361  if ( ret<0 )
362  {
363  *len = -1;
364  fprintf(stderr, "[fai_fetch] Error: fai_fetch failed. (Seeking in a compressed, .gzi unindexed, file?)\n");
365  return NULL;
366  }
367  l = 0;
368  s = (char*)malloc(end - beg + 2);
369  while ( (c=bgzf_getc(fai->bgzf))>=0 && l < end - beg )
370  if (isgraph(c)) s[l++] = c;
371  s[l] = '\0';
372  *len = l;
373  return s;
374 }
375 
376 int faidx_fetch_nseq(const faidx_t *fai)
377 {
378  return fai->n;
379 }
380 
381 char *faidx_fetch_seq(const faidx_t *fai, const char *c_name, int p_beg_i, int p_end_i, int *len)
382 {
383  int l;
384  char c;
385  khiter_t iter;
386  faidx1_t val;
387  char *seq=NULL;
388 
389  // Adjust position
390  iter = kh_get(s, fai->hash, c_name);
391  if (iter == kh_end(fai->hash))
392  {
393  *len = -2;
394  fprintf(stderr, "[fai_fetch_seq] The sequence \"%s\" not found\n", c_name);
395  return NULL;
396  }
397  val = kh_value(fai->hash, iter);
398  if(p_end_i < p_beg_i) p_beg_i = p_end_i;
399  if(p_beg_i < 0) p_beg_i = 0;
400  else if(val.len <= p_beg_i) p_beg_i = val.len - 1;
401  if(p_end_i < 0) p_end_i = 0;
402  else if(val.len <= p_end_i) p_end_i = val.len - 1;
403 
404  // Now retrieve the sequence
405  int ret = bgzf_useek(fai->bgzf, val.offset + p_beg_i / val.line_blen * val.line_len + p_beg_i % val.line_blen, SEEK_SET);
406  if ( ret<0 )
407  {
408  *len = -1;
409  fprintf(stderr, "[fai_fetch_seq] Error: fai_fetch failed. (Seeking in a compressed, .gzi unindexed, file?)\n");
410  return NULL;
411  }
412  l = 0;
413  seq = (char*)malloc(p_end_i - p_beg_i + 2);
414  while ( (c=bgzf_getc(fai->bgzf))>=0 && l < p_end_i - p_beg_i + 1)
415  if (isgraph(c)) seq[l++] = c;
416  seq[l] = '\0';
417  *len = l;
418  return seq;
419 }
420 
421