NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
hts.c
Go to the documentation of this file.
1 #include <zlib.h>
2 #include <ctype.h>
3 #include <stdio.h>
4 #include <string.h>
5 #include <stdlib.h>
6 #include <fcntl.h>
7 #include <errno.h>
8 #include <sys/stat.h>
9 #include "htslib/bgzf.h"
10 #include "htslib/hts.h"
11 #include "cram/cram.h"
12 #include "htslib/hfile.h"
13 #include "version.h"
14 
15 #include "htslib/kseq.h"
16 #define KS_BGZF 1
17 #if KS_BGZF
18  // bgzf now supports gzip-compressed files
19  KSTREAM_INIT2(, BGZF*, bgzf_read, 65536)
20 #else
21  KSTREAM_INIT2(, gzFile, gzread, 16384)
22 #endif
23 
24 #include "htslib/khash.h"
26 
27 int hts_verbose = 3;
28 
30 {
31  return HTS_VERSION;
32 }
33 
34 const unsigned char seq_nt16_table[256] = {
35  15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
36  15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
37  15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
38  1, 2, 4, 8, 15,15,15,15, 15,15,15,15, 15, 0 /*=*/,15,15,
39  15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
40  15,15, 5, 6, 8,15, 7, 9, 15,10,15,15, 15,15,15,15,
41  15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
42  15,15, 5, 6, 8,15, 7, 9, 15,10,15,15, 15,15,15,15,
43 
44  15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
45  15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
46  15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
47  15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
48  15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
49  15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
50  15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
51  15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15
52 };
53 
54 const char seq_nt16_str[] = "=ACMGRSVTWYHKDBN";
55 
56 /**********************
57  *** Basic file I/O ***
58  **********************/
59 
60 // Decompress up to ten or so bytes by peeking at the file, which must be
61 // positioned at the start of a GZIP block.
62 static size_t decompress_peek(hFILE *fp, unsigned char *dest, size_t destsize)
63 {
64  // Typically at most a couple of hundred bytes of input are required
65  // to get a few bytes of output from inflate(), so hopefully this buffer
66  // size suffices in general.
67  unsigned char buffer[512];
68  z_stream zs;
69  ssize_t npeek = hpeek(fp, buffer, sizeof buffer);
70 
71  if (npeek < 0) return 0;
72 
73  zs.zalloc = NULL;
74  zs.zfree = NULL;
75  zs.next_in = buffer;
76  zs.avail_in = npeek;
77  zs.next_out = dest;
78  zs.avail_out = destsize;
79  if (inflateInit2(&zs, 31) != Z_OK) return 0;
80 
81  while (zs.total_out < destsize)
82  if (inflate(&zs, Z_SYNC_FLUSH) != Z_OK) break;
83 
84  destsize = zs.total_out;
85  inflateEnd(&zs);
86 
87  return destsize;
88 }
89 
90 // Returns whether the block contains any control characters, i.e.,
91 // characters less than SPACE other than whitespace etc (ASCII BEL..CR).
92 static int is_binary(unsigned char *s, size_t n)
93 {
94  size_t i;
95  for (i = 0; i < n; i++)
96  if (s[i] < 0x07 || (s[i] >= 0x0e && s[i] < 0x20)) return 1;
97  return 0;
98 }
99 
100 htsFile *hts_open(const char *fn, const char *mode)
101 {
102  htsFile *fp = NULL;
103  hFILE *hfile = hopen(fn, mode);
104  if (hfile == NULL) goto error;
105 
106  fp = (htsFile*)calloc(1, sizeof(htsFile));
107  if (fp == NULL) goto error;
108 
109  fp->fn = strdup(fn);
110  fp->is_be = ed_is_big();
111 
112  if (strchr(mode, 'r')) {
113  unsigned char s[18];
114  if (hpeek(hfile, s, 6) == 6 && memcmp(s, "CRAM", 4) == 0 &&
115  s[4] >= 1 && s[4] <= 2 && s[5] <= 1) {
116  fp->is_cram = 1;
117  }
118  else if (hpeek(hfile, s, 18) == 18 && s[0] == 0x1f && s[1] == 0x8b &&
119  (s[3] & 4) && memcmp(&s[12], "BC\2\0", 4) == 0) {
120  // The stream is BGZF-compressed. Decompress a few bytes to see
121  // whether it's in a binary format (e.g., BAM or BCF, starting
122  // with four bytes of magic including a control character) or is
123  // a bgzipped SAM or VCF text file.
124  fp->is_compressed = 1;
125  if (is_binary(s, decompress_peek(hfile, s, 4))) fp->is_bin = 1;
126  else fp->is_kstream = 1;
127  }
128  else if (hpeek(hfile, s, 2) == 2 && s[0] == 0x1f && s[1] == 0x8b) {
129  // Plain GZIP header... so a gzipped text file.
130  fp->is_compressed = 1;
131  fp->is_kstream = 1;
132  }
133  else if (hpeek(hfile, s, 4) == 4 && is_binary(s, 4)) {
134  // Binary format, but in a raw non-compressed form.
135  fp->is_bin = 1;
136  }
137  else {
138  fp->is_kstream = 1;
139  }
140  }
141  else if (strchr(mode, 'w') || strchr(mode, 'a')) {
142  fp->is_write = 1;
143  if (strchr(mode, 'b')) fp->is_bin = 1;
144  if (strchr(mode, 'c')) fp->is_cram = 1;
145  if (strchr(mode, 'z')) fp->is_compressed = 1;
146  else if (strchr(mode, 'u')) fp->is_compressed = 0;
147  else fp->is_compressed = 2; // not set, default behaviour
148  }
149  else goto error;
150 
151  if (fp->is_bin || (fp->is_write && fp->is_compressed==1)) {
152  fp->fp.bgzf = bgzf_hopen(hfile, mode);
153  if (fp->fp.bgzf == NULL) goto error;
154  }
155  else if (fp->is_cram) {
156  fp->fp.cram = cram_dopen(hfile, fn, mode);
157  if (fp->fp.cram == NULL) goto error;
158  if (!fp->is_write)
160 
161  }
162  else if (fp->is_kstream) {
163  #if KS_BGZF
164  BGZF *gzfp = bgzf_hopen(hfile, mode);
165  #else
166  // TODO Implement gzip hFILE adaptor
167  hclose(hfile); // This won't work, especially for stdin
168  gzFile gzfp = strcmp(fn, "-")? gzopen(fn, "rb") : gzdopen(fileno(stdin), "rb");
169  #endif
170  if (gzfp) fp->fp.voidp = ks_init(gzfp);
171  else goto error;
172  }
173  else {
174  fp->fp.hfile = hfile;
175  }
176 
177  return fp;
178 
179 error:
180  if (hts_verbose >= 2)
181  fprintf(stderr, "[E::%s] fail to open file '%s'\n", __func__, fn);
182 
183  if (hfile)
184  hclose_abruptly(hfile);
185 
186  if (fp) {
187  free(fp->fn);
188  free(fp->fn_aux);
189  free(fp);
190  }
191  return NULL;
192 }
193 
195 {
196  int ret, save;
197 
198  if (fp->is_bin || (fp->is_write && fp->is_compressed==1)) {
199  ret = bgzf_close(fp->fp.bgzf);
200  } else if (fp->is_cram) {
201  if (!fp->is_write) {
202  switch (cram_eof(fp->fp.cram)) {
203  case 0:
204  fprintf(stderr, "[E::%s] Failed to decode sequence.\n", __func__);
205  return -1;
206  case 2:
207  fprintf(stderr, "[W::%s] EOF marker is absent. The input is probably truncated.\n", __func__);
208  break;
209  default: /* case 1, expected EOF */
210  break;
211  }
212  }
213  ret = cram_close(fp->fp.cram);
214  } else if (fp->is_kstream) {
215  #if KS_BGZF
216  BGZF *gzfp = ((kstream_t*)fp->fp.voidp)->f;
217  ret = bgzf_close(gzfp);
218  #else
219  gzFile gzfp = ((kstream_t*)fp->fp.voidp)->f;
220  ret = gzclose(gzfp);
221  #endif
222  ks_destroy((kstream_t*)fp->fp.voidp);
223  } else {
224  ret = hclose(fp->fp.hfile);
225  }
226 
227  save = errno;
228  free(fp->fn);
229  free(fp->fn_aux);
230  free(fp->line.s);
231  free(fp);
232  errno = save;
233  return ret;
234 }
235 
236 int hts_set_threads(htsFile *fp, int n)
237 {
238  // TODO Plug in CRAM and other threading
239  if (fp->is_bin) {
240  return bgzf_mt(fp->fp.bgzf, n, 256);
241  }
242  else return 0;
243 }
244 
245 int hts_set_fai_filename(htsFile *fp, const char *fn_aux)
246 {
247  free(fp->fn_aux);
248  if (fn_aux) {
249  fp->fn_aux = strdup(fn_aux);
250  if (fp->fn_aux == NULL) return -1;
251  }
252  else fp->fn_aux = NULL;
253 
254  return 0;
255 }
256 
257 // For VCF/BCF backward sweeper. Not exposing these functions because their
258 // future is uncertain. Things will probably have to change with hFILE...
260 {
261  if ( fp->is_bin )
262  return fp->fp.bgzf;
263  else
264  return ((kstream_t*)fp->fp.voidp)->f;
265 }
266 int hts_useek(htsFile *fp, long uoffset, int where)
267 {
268  if ( fp->is_bin )
269  return bgzf_useek(fp->fp.bgzf, uoffset, where);
270  else
271  {
272  ks_rewind((kstream_t*)fp->fp.voidp);
273  ((kstream_t*)fp->fp.voidp)->seek_pos = uoffset;
274  return bgzf_useek(((kstream_t*)fp->fp.voidp)->f, uoffset, where);
275  }
276 }
278 {
279  if ( fp->is_bin )
280  return bgzf_utell(fp->fp.bgzf);
281  else
282  return ((kstream_t*)fp->fp.voidp)->seek_pos;
283 }
284 
285 int hts_getline(htsFile *fp, int delimiter, kstring_t *str)
286 {
287  int ret, dret;
288  ret = ks_getuntil((kstream_t*)fp->fp.voidp, delimiter, str, &dret);
289  ++fp->lineno;
290  return ret;
291 }
292 
293 char **hts_readlist(const char *string, int is_file, int *_n)
294 {
295  int m = 0, n = 0, dret;
296  char **s = 0;
297  if ( is_file )
298  {
299 #if KS_BGZF
300  BGZF *fp = bgzf_open(string, "r");
301 #else
302  gzFile fp = gzopen(string, "r");
303 #endif
304  if ( !fp ) return NULL;
305 
306  kstream_t *ks;
307  kstring_t str;
308  str.s = 0; str.l = str.m = 0;
309  ks = ks_init(fp);
310  while (ks_getuntil(ks, KS_SEP_LINE, &str, &dret) >= 0)
311  {
312  if (str.l == 0) continue;
313  n++;
314  hts_expand(char*,n,m,s);
315  s[n-1] = strdup(str.s);
316  }
317  ks_destroy(ks);
318 #if KS_BGZF
319  bgzf_close(fp);
320 #else
321  gzclose(fp);
322 #endif
323  free(str.s);
324  }
325  else
326  {
327  const char *q = string, *p = string;
328  while ( 1 )
329  {
330  if (*p == ',' || *p == 0)
331  {
332  n++;
333  hts_expand(char*,n,m,s);
334  s[n-1] = (char*)calloc(p - q + 1, 1);
335  strncpy(s[n-1], q, p - q);
336  q = p + 1;
337  }
338  if ( !*p ) break;
339  p++;
340  }
341  }
342  s = (char**)realloc(s, n * sizeof(char*));
343  *_n = n;
344  return s;
345 }
346 
347 char **hts_readlines(const char *fn, int *_n)
348 {
349  int m = 0, n = 0, dret;
350  char **s = 0;
351 #if KS_BGZF
352  BGZF *fp = bgzf_open(fn, "r");
353 #else
354  gzFile fp = gzopen(fn, "r");
355 #endif
356  if ( fp ) { // read from file
357  kstream_t *ks;
358  kstring_t str;
359  str.s = 0; str.l = str.m = 0;
360  ks = ks_init(fp);
361  while (ks_getuntil(ks, KS_SEP_LINE, &str, &dret) >= 0) {
362  if (str.l == 0) continue;
363  if (m == n) {
364  m = m? m<<1 : 16;
365  s = (char**)realloc(s, m * sizeof(char*));
366  }
367  s[n++] = strdup(str.s);
368  }
369  ks_destroy(ks);
370  #if KS_BGZF
371  bgzf_close(fp);
372  #else
373  gzclose(fp);
374  #endif
375  s = (char**)realloc(s, n * sizeof(char*));
376  free(str.s);
377  } else if (*fn == ':') { // read from string
378  const char *q, *p;
379  for (q = p = fn + 1;; ++p)
380  if (*p == ',' || *p == 0) {
381  if (m == n) {
382  m = m? m<<1 : 16;
383  s = (char**)realloc(s, m * sizeof(char*));
384  }
385  s[n] = (char*)calloc(p - q + 1, 1);
386  strncpy(s[n++], q, p - q);
387  q = p + 1;
388  if (*p == 0) break;
389  }
390  } else return 0;
391  s = (char**)realloc(s, n * sizeof(char*));
392  *_n = n;
393  return s;
394 }
395 
396 int hts_file_type(const char *fname)
397 {
398  int len = strlen(fname);
399  if ( !strcasecmp(".vcf.gz",fname+len-7) ) return FT_VCF_GZ;
400  if ( !strcasecmp(".vcf",fname+len-4) ) return FT_VCF;
401  if ( !strcasecmp(".bcf",fname+len-4) ) return FT_BCF_GZ;
402  if ( !strcmp("-",fname) ) return FT_STDIN;
403  // ... etc
404 
405  int fd = open(fname, O_RDONLY);
406  if ( !fd ) return 0;
407 
408  uint8_t magic[5];
409  if ( read(fd,magic,2)!=2 ) { close(fd); return 0; }
410  if ( !strncmp((char*)magic,"##",2) ) { close(fd); return FT_VCF; }
411  if ( !strncmp((char*)magic,"BCF",3) ) { close(fd); return FT_BCF; }
412  close(fd);
413 
414  if ( magic[0]==0x1f && magic[1]==0x8b ) // compressed
415  {
416  BGZF *fp = bgzf_open(fname, "r");
417  if ( !fp ) return 0;
418  if ( bgzf_read(fp, magic, 3)!=3 ) { bgzf_close(fp); return 0; }
419  bgzf_close(fp);
420  if ( !strncmp((char*)magic,"##",2) ) return FT_VCF;
421  if ( !strncmp((char*)magic,"BCF",3) ) return FT_BCF_GZ;
422  }
423  return 0;
424 }
425 
426 /****************
427  *** Indexing ***
428  ****************/
429 
430 #define HTS_MIN_MARKER_DIST 0x10000
431 
432 // Finds the special meta bin
433 // ((1<<(3 * n_lvls + 3)) - 1) / 7 + 1
434 #define META_BIN(idx) ((idx)->n_bins + 1)
435 
436 #define pair64_lt(a,b) ((a).u < (b).u)
437 
438 #include "htslib/ksort.h"
440 
441 typedef struct {
442  int32_t m, n;
445 } bins_t;
446 
447 #include "htslib/khash.h"
449 typedef khash_t(bin) bidx_t;
450 
451 typedef struct {
452  int32_t n, m;
453  uint64_t *offset;
455 
456 struct __hts_idx_t {
461  bidx_t **bidx;
464  struct {
470  } z; // keep internal states
471 };
472 
473 static inline void insert_to_b(bidx_t *b, int bin, uint64_t beg, uint64_t end)
474 {
475  khint_t k;
476  bins_t *l;
477  int absent;
478  k = kh_put(bin, b, bin, &absent);
479  l = &kh_value(b, k);
480  if (absent) {
481  l->m = 1; l->n = 0;
482  l->list = (hts_pair64_t*)calloc(l->m, 16);
483  }
484  if (l->n == l->m) {
485  l->m <<= 1;
486  l->list = (hts_pair64_t*)realloc(l->list, l->m * 16);
487  }
488  l->list[l->n].u = beg;
489  l->list[l->n++].v = end;
490 }
491 
492 static inline void insert_to_l(lidx_t *l, int64_t _beg, int64_t _end, uint64_t offset, int min_shift)
493 {
494  int i, beg, end;
495  beg = _beg >> min_shift;
496  end = (_end - 1) >> min_shift;
497  if (l->m < end + 1) {
498  int old_m = l->m;
499  l->m = end + 1;
500  kroundup32(l->m);
501  l->offset = (uint64_t*)realloc(l->offset, l->m * 8);
502  memset(l->offset + old_m, 0xff, 8 * (l->m - old_m)); // fill l->offset with (uint64_t)-1
503  }
504  if (beg == end) { // to save a loop in this case
505  if (l->offset[beg] == (uint64_t)-1) l->offset[beg] = offset;
506  } else {
507  for (i = beg; i <= end; ++i)
508  if (l->offset[i] == (uint64_t)-1) l->offset[i] = offset;
509  }
510  if (l->n < end + 1) l->n = end + 1;
511 }
512 
513 hts_idx_t *hts_idx_init(int n, int fmt, uint64_t offset0, int min_shift, int n_lvls)
514 {
515  hts_idx_t *idx;
516  idx = (hts_idx_t*)calloc(1, sizeof(hts_idx_t));
517  if (idx == NULL) return NULL;
518  idx->fmt = fmt;
519  idx->min_shift = min_shift;
520  idx->n_lvls = n_lvls;
521  idx->n_bins = ((1<<(3 * n_lvls + 3)) - 1) / 7;
522  idx->z.save_bin = idx->z.save_tid = idx->z.last_tid = idx->z.last_bin = 0xffffffffu;
523  idx->z.save_off = idx->z.last_off = idx->z.off_beg = idx->z.off_end = offset0;
524  idx->z.last_coor = 0xffffffffu;
525  if (n) {
526  idx->n = idx->m = n;
527  idx->bidx = (bidx_t**)calloc(n, sizeof(bidx_t*));
528  if (idx->bidx == NULL) { free(idx); return NULL; }
529  idx->lidx = (lidx_t*) calloc(n, sizeof(lidx_t));
530  if (idx->lidx == NULL) { free(idx->bidx); free(idx); return NULL; }
531  }
532  return idx;
533 }
534 
535 static void update_loff(hts_idx_t *idx, int i, int free_lidx)
536 {
537  bidx_t *bidx = idx->bidx[i];
538  lidx_t *lidx = &idx->lidx[i];
539  khint_t k;
540  int l;
541  uint64_t offset0 = 0;
542  if (bidx) {
543  k = kh_get(bin, bidx, META_BIN(idx));
544  if (k != kh_end(bidx))
545  offset0 = kh_val(bidx, k).list[0].u;
546  for (l = 0; l < lidx->n && lidx->offset[l] == (uint64_t)-1; ++l)
547  lidx->offset[l] = offset0;
548  } else l = 1;
549  for (; l < lidx->n; ++l) // fill missing values
550  if (lidx->offset[l] == (uint64_t)-1)
551  lidx->offset[l] = lidx->offset[l-1];
552  if (bidx == 0) return;
553  for (k = kh_begin(bidx); k != kh_end(bidx); ++k) // set loff
554  if (kh_exist(bidx, k))
555  {
556  if ( kh_key(bidx, k) < idx->n_bins )
557  {
558  int bot_bin = hts_bin_bot(kh_key(bidx, k), idx->n_lvls);
559  // disable linear index if bot_bin out of bounds
560  kh_val(bidx, k).loff = bot_bin < lidx->n ? lidx->offset[bot_bin] : 0;
561  }
562  else
563  kh_val(bidx, k).loff = 0;
564  }
565  if (free_lidx) {
566  free(lidx->offset);
567  lidx->m = lidx->n = 0;
568  lidx->offset = 0;
569  }
570 }
571 
572 static void compress_binning(hts_idx_t *idx, int i)
573 {
574  bidx_t *bidx = idx->bidx[i];
575  khint_t k;
576  int l, m;
577  if (bidx == 0) return;
578  // merge a bin to its parent if the bin is too small
579  for (l = idx->n_lvls; l > 0; --l) {
580  unsigned start = hts_bin_first(l);
581  for (k = kh_begin(bidx); k != kh_end(bidx); ++k) {
582  bins_t *p, *q;
583  if (!kh_exist(bidx, k) || kh_key(bidx, k) >= idx->n_bins || kh_key(bidx, k) < start) continue;
584  p = &kh_value(bidx, k);
585  if (l < idx->n_lvls && p->n > 1) ks_introsort(_off, p->n, p->list);
586  if ((p->list[p->n - 1].v>>16) - (p->list[0].u>>16) < HTS_MIN_MARKER_DIST) {
587  khint_t kp;
588  kp = kh_get(bin, bidx, hts_bin_parent(kh_key(bidx, k)));
589  if (kp == kh_end(bidx)) continue;
590  q = &kh_val(bidx, kp);
591  if (q->n + p->n > q->m) {
592  q->m = q->n + p->n;
593  kroundup32(q->m);
594  q->list = (hts_pair64_t*)realloc(q->list, q->m * 16);
595  }
596  memcpy(q->list + q->n, p->list, p->n * 16);
597  q->n += p->n;
598  free(p->list);
599  kh_del(bin, bidx, k);
600  }
601  }
602  }
603  k = kh_get(bin, bidx, 0);
604  if (k != kh_end(bidx)) ks_introsort(_off, kh_val(bidx, k).n, kh_val(bidx, k).list);
605  // merge adjacent chunks that start from the same BGZF block
606  for (k = kh_begin(bidx); k != kh_end(bidx); ++k) {
607  bins_t *p;
608  if (!kh_exist(bidx, k) || kh_key(bidx, k) >= idx->n_bins) continue;
609  p = &kh_value(bidx, k);
610  for (l = 1, m = 0; l < p->n; ++l) {
611  if (p->list[m].v>>16 >= p->list[l].u>>16) {
612  if (p->list[m].v < p->list[l].v) p->list[m].v = p->list[l].v;
613  } else p->list[++m] = p->list[l];
614  }
615  p->n = m + 1;
616  }
617 }
618 
619 void hts_idx_finish(hts_idx_t *idx, uint64_t final_offset)
620 {
621  int i;
622  if (idx == NULL || idx->z.finished) return; // do not run this function on an empty index or multiple times
623  if (idx->z.save_tid >= 0) {
624  insert_to_b(idx->bidx[idx->z.save_tid], idx->z.save_bin, idx->z.save_off, final_offset);
625  insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx), idx->z.off_beg, final_offset);
626  insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx), idx->z.n_mapped, idx->z.n_unmapped);
627  }
628  for (i = 0; i < idx->n; ++i) {
629  update_loff(idx, i, (idx->fmt == HTS_FMT_CSI));
630  compress_binning(idx, i);
631  }
632  idx->z.finished = 1;
633 }
634 
635 int hts_idx_push(hts_idx_t *idx, int tid, int beg, int end, uint64_t offset, int is_mapped)
636 {
637  int bin;
638  if (tid >= idx->m) { // enlarge the index
639  int32_t oldm = idx->m;
640  idx->m = idx->m? idx->m<<1 : 2;
641  idx->bidx = (bidx_t**)realloc(idx->bidx, idx->m * sizeof(bidx_t*));
642  idx->lidx = (lidx_t*) realloc(idx->lidx, idx->m * sizeof(lidx_t));
643  memset(&idx->bidx[oldm], 0, (idx->m - oldm) * sizeof(bidx_t*));
644  memset(&idx->lidx[oldm], 0, (idx->m - oldm) * sizeof(lidx_t));
645  }
646  if (idx->n < tid + 1) idx->n = tid + 1;
647  if (idx->z.finished) return 0;
648  if (idx->z.last_tid != tid || (idx->z.last_tid >= 0 && tid < 0)) { // change of chromosome
649  if ( tid>=0 && idx->n_no_coor )
650  {
651  if (hts_verbose >= 1) fprintf(stderr,"[E::%s] NO_COOR reads not in a single block at the end %d %d\n", __func__, tid,idx->z.last_tid);
652  return -1;
653  }
654  if (tid>=0 && idx->bidx[tid] != 0)
655  {
656  if (hts_verbose >= 1) fprintf(stderr, "[E::%s] chromosome blocks not continuous\n", __func__);
657  return -1;
658  }
659  idx->z.last_tid = tid;
660  idx->z.last_bin = 0xffffffffu;
661  } else if (tid >= 0 && idx->z.last_coor > beg) { // test if positions are out of order
662  if (hts_verbose >= 1) fprintf(stderr, "[E::%s] unsorted positions\n", __func__);
663  return -1;
664  }
665  if ( tid>=0 )
666  {
667  if (idx->bidx[tid] == 0) idx->bidx[tid] = kh_init(bin);
668  if ( is_mapped)
669  insert_to_l(&idx->lidx[tid], beg, end, idx->z.last_off, idx->min_shift); // last_off points to the start of the current record
670  }
671  else idx->n_no_coor++;
672  bin = hts_reg2bin(beg, end, idx->min_shift, idx->n_lvls);
673  if ((int)idx->z.last_bin != bin) { // then possibly write the binning index
674  if (idx->z.save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens to the first record
675  insert_to_b(idx->bidx[idx->z.save_tid], idx->z.save_bin, idx->z.save_off, idx->z.last_off);
676  if (idx->z.last_bin == 0xffffffffu && idx->z.save_bin != 0xffffffffu) { // change of chr; keep meta information
677  idx->z.off_end = idx->z.last_off;
678  insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx), idx->z.off_beg, idx->z.off_end);
679  insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx), idx->z.n_mapped, idx->z.n_unmapped);
680  idx->z.n_mapped = idx->z.n_unmapped = 0;
681  idx->z.off_beg = idx->z.off_end;
682  }
683  idx->z.save_off = idx->z.last_off;
684  idx->z.save_bin = idx->z.last_bin = bin;
685  idx->z.save_tid = tid;
686  if (tid < 0) { // come to the end of the records having coordinates
687  hts_idx_finish(idx, offset);
688  return 0;
689  }
690  }
691  if (is_mapped) ++idx->z.n_mapped;
692  else ++idx->z.n_unmapped;
693  idx->z.last_off = offset;
694  idx->z.last_coor = beg;
695  return 0;
696 }
697 
699 {
700  khint_t k;
701  int i;
702  if (idx == 0) return;
703  // For HTS_FMT_CRAI, idx actually points to a different type -- see sam.c
704  if (idx->fmt == HTS_FMT_CRAI) { free(idx); return; }
705 
706  for (i = 0; i < idx->m; ++i) {
707  bidx_t *bidx = idx->bidx[i];
708  free(idx->lidx[i].offset);
709  if (bidx == 0) continue;
710  for (k = kh_begin(bidx); k != kh_end(bidx); ++k)
711  if (kh_exist(bidx, k))
712  free(kh_value(bidx, k).list);
713  kh_destroy(bin, bidx);
714  }
715  free(idx->bidx); free(idx->lidx); free(idx->meta);
716  free(idx);
717 }
718 
719 static inline long idx_read(int is_bgzf, void *fp, void *buf, long l)
720 {
721  if (is_bgzf) return bgzf_read((BGZF*)fp, buf, l);
722  else return (long)fread(buf, 1, l, (FILE*)fp);
723 }
724 
725 static inline long idx_write(int is_bgzf, void *fp, const void *buf, long l)
726 {
727  if (is_bgzf) return bgzf_write((BGZF*)fp, buf, l);
728  else return (long)fwrite(buf, 1, l, (FILE*)fp);
729 }
730 
731 static inline void swap_bins(bins_t *p)
732 {
733  int i;
734  for (i = 0; i < p->n; ++i) {
735  ed_swap_8p(&p->list[i].u);
736  ed_swap_8p(&p->list[i].v);
737  }
738 }
739 
740 static void hts_idx_save_core(const hts_idx_t *idx, void *fp, int fmt)
741 {
742  int32_t i, size, is_be;
743  int is_bgzf = (fmt != HTS_FMT_BAI);
744  is_be = ed_is_big();
745  if (is_be) {
746  uint32_t x = idx->n;
747  idx_write(is_bgzf, fp, ed_swap_4p(&x), 4);
748  } else idx_write(is_bgzf, fp, &idx->n, 4);
749  if (fmt == HTS_FMT_TBI && idx->l_meta) idx_write(is_bgzf, fp, idx->meta, idx->l_meta);
750  for (i = 0; i < idx->n; ++i) {
751  khint_t k;
752  bidx_t *bidx = idx->bidx[i];
753  lidx_t *lidx = &idx->lidx[i];
754  // write binning index
755  size = bidx? kh_size(bidx) : 0;
756  if (is_be) { // big endian
757  uint32_t x = size;
758  idx_write(is_bgzf, fp, ed_swap_4p(&x), 4);
759  } else idx_write(is_bgzf, fp, &size, 4);
760  if (bidx == 0) goto write_lidx;
761  for (k = kh_begin(bidx); k != kh_end(bidx); ++k) {
762  bins_t *p;
763  if (!kh_exist(bidx, k)) continue;
764  p = &kh_value(bidx, k);
765  if (is_be) { // big endian
766  uint32_t x;
767  x = kh_key(bidx, k); idx_write(is_bgzf, fp, ed_swap_4p(&x), 4);
768  if (fmt == HTS_FMT_CSI) {
769  uint64_t y = kh_val(bidx, k).loff;
770  idx_write(is_bgzf, fp, ed_swap_4p(&y), 8);
771  }
772  x = p->n; idx_write(is_bgzf, fp, ed_swap_4p(&x), 4);
773  swap_bins(p);
774  idx_write(is_bgzf, fp, p->list, 16 * p->n);
775  swap_bins(p);
776  } else {
777  idx_write(is_bgzf, fp, &kh_key(bidx, k), 4);
778  if (fmt == HTS_FMT_CSI) idx_write(is_bgzf, fp, &kh_val(bidx, k).loff, 8);
779  //int j;for(j=0;j<p->n;++j)fprintf(stderr,"%d,%llx,%d,%llx:%llx\n",kh_key(bidx,k),kh_val(bidx, k).loff,j,p->list[j].u,p->list[j].v);
780  idx_write(is_bgzf, fp, &p->n, 4);
781  idx_write(is_bgzf, fp, p->list, p->n << 4);
782  }
783  }
784 write_lidx:
785  if (fmt != HTS_FMT_CSI) {
786  if (is_be) {
787  int32_t x = lidx->n;
788  idx_write(is_bgzf, fp, ed_swap_4p(&x), 4);
789  for (x = 0; x < lidx->n; ++x) ed_swap_8p(&lidx->offset[x]);
790  idx_write(is_bgzf, fp, lidx->offset, lidx->n << 3);
791  for (x = 0; x < lidx->n; ++x) ed_swap_8p(&lidx->offset[x]);
792  } else {
793  idx_write(is_bgzf, fp, &lidx->n, 4);
794  idx_write(is_bgzf, fp, lidx->offset, lidx->n << 3);
795  }
796  }
797  }
798  if (is_be) { // write the number of reads without coordinates
799  uint64_t x = idx->n_no_coor;
800  idx_write(is_bgzf, fp, &x, 8);
801  } else idx_write(is_bgzf, fp, &idx->n_no_coor, 8);
802 }
803 
804 void hts_idx_save(const hts_idx_t *idx, const char *fn, int fmt)
805 {
806  char *fnidx;
807  fnidx = (char*)calloc(1, strlen(fn) + 5);
808  strcpy(fnidx, fn);
809  if (fmt == HTS_FMT_CSI) {
810  BGZF *fp;
811  uint32_t x[3];
812  int is_be, i;
813  is_be = ed_is_big();
814  fp = bgzf_open(strcat(fnidx, ".csi"), "w");
815  bgzf_write(fp, "CSI\1", 4);
816  x[0] = idx->min_shift; x[1] = idx->n_lvls; x[2] = idx->l_meta;
817  if (is_be) {
818  for (i = 0; i < 3; ++i)
819  bgzf_write(fp, ed_swap_4p(&x[i]), 4);
820  } else bgzf_write(fp, &x, 12);
821  if (idx->l_meta) bgzf_write(fp, idx->meta, idx->l_meta);
822  hts_idx_save_core(idx, fp, HTS_FMT_CSI);
823  bgzf_close(fp);
824  } else if (fmt == HTS_FMT_TBI) {
825  BGZF *fp;
826  fp = bgzf_open(strcat(fnidx, ".tbi"), "w");
827  bgzf_write(fp, "TBI\1", 4);
828  hts_idx_save_core(idx, fp, HTS_FMT_TBI);
829  bgzf_close(fp);
830  } else if (fmt == HTS_FMT_BAI) {
831  FILE *fp;
832  fp = fopen(strcat(fnidx, ".bai"), "w");
833  fwrite("BAI\1", 1, 4, fp);
834  hts_idx_save_core(idx, fp, HTS_FMT_BAI);
835  fclose(fp);
836  } else abort();
837  free(fnidx);
838 }
839 
840 static int hts_idx_load_core(hts_idx_t *idx, void *fp, int fmt)
841 {
842  int32_t i, n, is_be;
843  int is_bgzf = (fmt != HTS_FMT_BAI);
844  is_be = ed_is_big();
845  if (idx == NULL) return -4;
846  for (i = 0; i < idx->n; ++i) {
847  bidx_t *h;
848  lidx_t *l = &idx->lidx[i];
849  uint32_t key;
850  int j, absent;
851  bins_t *p;
852  h = idx->bidx[i] = kh_init(bin);
853  if (idx_read(is_bgzf, fp, &n, 4) != 4) return -1;
854  if (is_be) ed_swap_4p(&n);
855  for (j = 0; j < n; ++j) {
856  khint_t k;
857  if (idx_read(is_bgzf, fp, &key, 4) != 4) return -1;
858  if (is_be) ed_swap_4p(&key);
859  k = kh_put(bin, h, key, &absent);
860  if (absent <= 0) return -3; // Duplicate bin number
861  p = &kh_val(h, k);
862  if (fmt == HTS_FMT_CSI) {
863  if (idx_read(is_bgzf, fp, &p->loff, 8) != 8) return -1;
864  if (is_be) ed_swap_8p(&p->loff);
865  } else p->loff = 0;
866  if (idx_read(is_bgzf, fp, &p->n, 4) != 4) return -1;
867  if (is_be) ed_swap_4p(&p->n);
868  p->m = p->n;
869  p->list = (hts_pair64_t*)malloc(p->m * 16);
870  if (p->list == NULL) return -2;
871  if (idx_read(is_bgzf, fp, p->list, p->n<<4) != p->n<<4) return -1;
872  if (is_be) swap_bins(p);
873  }
874  if (fmt != HTS_FMT_CSI) { // load linear index
875  int j;
876  if (idx_read(is_bgzf, fp, &l->n, 4) != 4) return -1;
877  if (is_be) ed_swap_4p(&l->n);
878  l->m = l->n;
879  l->offset = (uint64_t*)malloc(l->n << 3);
880  if (l->offset == NULL) return -2;
881  if (idx_read(is_bgzf, fp, l->offset, l->n << 3) != l->n << 3) return -1;
882  if (is_be) for (j = 0; j < l->n; ++j) ed_swap_8p(&l->offset[j]);
883  for (j = 1; j < l->n; ++j) // fill missing values; may happen given older samtools and tabix
884  if (l->offset[j] == 0) l->offset[j] = l->offset[j-1];
885  update_loff(idx, i, 1);
886  }
887  }
888  if (idx_read(is_bgzf, fp, &idx->n_no_coor, 8) != 8) idx->n_no_coor = 0;
889  if (is_be) ed_swap_8p(&idx->n_no_coor);
890  return 0;
891 }
892 
893 hts_idx_t *hts_idx_load_local(const char *fn, int fmt)
894 {
895  uint8_t magic[4];
896  int i, is_be;
897  hts_idx_t *idx = NULL;
898  is_be = ed_is_big();
899  if (fmt == HTS_FMT_CSI) {
900  BGZF *fp;
901  uint32_t x[3], n;
902  uint8_t *meta = 0;
903  if ((fp = bgzf_open(fn, "r")) == 0) return NULL;
904  if (bgzf_read(fp, magic, 4) != 4) goto csi_fail;
905  if (memcmp(magic, "CSI\1", 4) != 0) goto csi_fail;
906  if (bgzf_read(fp, x, 12) != 12) goto csi_fail;
907  if (is_be) for (i = 0; i < 3; ++i) ed_swap_4p(&x[i]);
908  if (x[2]) {
909  if ((meta = (uint8_t*)malloc(x[2])) == NULL) goto csi_fail;
910  if (bgzf_read(fp, meta, x[2]) != x[2]) goto csi_fail;
911  }
912  if (bgzf_read(fp, &n, 4) != 4) goto csi_fail;
913  if (is_be) ed_swap_4p(&n);
914  if ((idx = hts_idx_init(n, fmt, 0, x[0], x[1])) == NULL) goto csi_fail;
915  idx->l_meta = x[2];
916  idx->meta = meta;
917  meta = NULL;
918  if (hts_idx_load_core(idx, fp, HTS_FMT_CSI) < 0) goto csi_fail;
919  bgzf_close(fp);
920  return idx;
921 
922  csi_fail:
923  bgzf_close(fp);
924  hts_idx_destroy(idx);
925  free(meta);
926  return NULL;
927 
928  } else if (fmt == HTS_FMT_TBI) {
929  BGZF *fp;
930  uint32_t x[8];
931  if ((fp = bgzf_open(fn, "r")) == 0) return NULL;
932  if (bgzf_read(fp, magic, 4) != 4) goto tbi_fail;
933  if (memcmp(magic, "TBI\1", 4) != 0) goto tbi_fail;
934  if (bgzf_read(fp, x, 32) != 32) goto tbi_fail;
935  if (is_be) for (i = 0; i < 8; ++i) ed_swap_4p(&x[i]);
936  if ((idx = hts_idx_init(x[0], fmt, 0, 14, 5)) == NULL) goto tbi_fail;
937  idx->l_meta = 28 + x[7];
938  if ((idx->meta = (uint8_t*)malloc(idx->l_meta)) == NULL) goto tbi_fail;
939  memcpy(idx->meta, &x[1], 28);
940  if (bgzf_read(fp, idx->meta + 28, x[7]) != x[7]) goto tbi_fail;
941  if (hts_idx_load_core(idx, fp, HTS_FMT_TBI) < 0) goto tbi_fail;
942  bgzf_close(fp);
943  return idx;
944 
945  tbi_fail:
946  bgzf_close(fp);
947  hts_idx_destroy(idx);
948  return NULL;
949 
950  } else if (fmt == HTS_FMT_BAI) {
951  uint32_t n;
952  FILE *fp;
953  if ((fp = fopen(fn, "rb")) == 0) return NULL;
954  if (fread(magic, 1, 4, fp) != 4) goto bai_fail;
955  if (memcmp(magic, "BAI\1", 4) != 0) goto bai_fail;
956  if (fread(&n, 4, 1, fp) != 1) goto bai_fail;
957  if (is_be) ed_swap_4p(&n);
958  idx = hts_idx_init(n, fmt, 0, 14, 5);
959  if (hts_idx_load_core(idx, fp, HTS_FMT_BAI) < 0) goto bai_fail;
960  fclose(fp);
961  return idx;
962 
963  bai_fail:
964  fclose(fp);
965  hts_idx_destroy(idx);
966  return NULL;
967 
968  } else abort();
969 }
970 
971 void hts_idx_set_meta(hts_idx_t *idx, int l_meta, uint8_t *meta, int is_copy)
972 {
973  if (idx->meta) free(idx->meta);
974  idx->l_meta = l_meta;
975  if (is_copy) {
976  idx->meta = (uint8_t*)malloc(l_meta);
977  memcpy(idx->meta, meta, l_meta);
978  } else idx->meta = meta;
979 }
980 
981 uint8_t *hts_idx_get_meta(hts_idx_t *idx, int *l_meta)
982 {
983  *l_meta = idx->l_meta;
984  return idx->meta;
985 }
986 
987 const char **hts_idx_seqnames(const hts_idx_t *idx, int *n, hts_id2name_f getid, void *hdr)
988 {
989  if ( !idx->n )
990  {
991  *n = 0;
992  return NULL;
993  }
994 
995  int tid = 0, i;
996  const char **names = (const char**) calloc(idx->n,sizeof(const char*));
997  for (i=0; i<idx->n; i++)
998  {
999  bidx_t *bidx = idx->bidx[i];
1000  if ( !bidx ) continue;
1001  names[tid++] = getid(hdr,i);
1002  }
1003  *n = tid;
1004  return names;
1005 }
1006 
1007 int hts_idx_get_stat(const hts_idx_t* idx, int tid, uint64_t* mapped, uint64_t* unmapped)
1008 {
1009  if ( idx->fmt == HTS_FMT_CRAI ) {
1010  *mapped = 0; *unmapped = 0;
1011  return -1;
1012  }
1013 
1014  bidx_t *h = idx->bidx[tid];
1015  khint_t k = kh_get(bin, h, META_BIN(idx));
1016  if (k != kh_end(h)) {
1017  *mapped = kh_val(h, k).list[1].u;
1018  *unmapped = kh_val(h, k).list[1].v;
1019  return 0;
1020  } else {
1021  *mapped = 0; *unmapped = 0;
1022  return -1;
1023  }
1024 }
1025 
1027 {
1028  return idx->n_no_coor;
1029 }
1030 
1031 /****************
1032  *** Iterator ***
1033  ****************/
1034 
1035 static inline int reg2bins(int64_t beg, int64_t end, hts_itr_t *itr, int min_shift, int n_lvls)
1036 {
1037  int l, t, s = min_shift + (n_lvls<<1) + n_lvls;
1038  if (beg >= end) return 0;
1039  if (end >= 1LL<<s) end = 1LL<<s;
1040  for (--end, l = 0, t = 0; l <= n_lvls; s -= 3, t += 1<<((l<<1)+l), ++l) {
1041  int b, e, n, i;
1042  b = t + (beg>>s); e = t + (end>>s); n = e - b + 1;
1043  if (itr->bins.n + n > itr->bins.m) {
1044  itr->bins.m = itr->bins.n + n;
1045  kroundup32(itr->bins.m);
1046  itr->bins.a = (int*)realloc(itr->bins.a, sizeof(int) * itr->bins.m);
1047  }
1048  for (i = b; i <= e; ++i) itr->bins.a[itr->bins.n++] = i;
1049  }
1050  return itr->bins.n;
1051 }
1052 
1053 hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec)
1054 {
1055  int i, n_off, l, bin;
1056  hts_pair64_t *off;
1057  khint_t k;
1058  bidx_t *bidx;
1059  uint64_t min_off;
1060  hts_itr_t *iter = 0;
1061  if (tid < 0) {
1062  int finished0 = 0;
1063  uint64_t off0 = (uint64_t)-1;
1064  khint_t k;
1065  switch (tid) {
1066  case HTS_IDX_START:
1067  // Find the smallest offset, note that sequence ids may not be ordered sequentially
1068  for (i=0; i<idx->n; i++)
1069  {
1070  bidx = idx->bidx[i];
1071  k = kh_get(bin, bidx, META_BIN(idx));
1072  if (k == kh_end(bidx)) continue;
1073  if ( off0 > kh_val(bidx, k).list[0].u ) off0 = kh_val(bidx, k).list[0].u;
1074  }
1075  if ( off0==(uint64_t)-1 && idx->n_no_coor ) off0 = 0; // only no-coor reads in this bam
1076  break;
1077 
1078  case HTS_IDX_NOCOOR:
1079  if ( idx->n>0 )
1080  {
1081  bidx = idx->bidx[idx->n - 1];
1082  k = kh_get(bin, bidx, META_BIN(idx));
1083  if (k != kh_end(bidx)) off0 = kh_val(bidx, k).list[0].v;
1084  }
1085  if ( off0==(uint64_t)-1 && idx->n_no_coor ) off0 = 0; // only no-coor reads in this bam
1086  break;
1087 
1088  case HTS_IDX_REST:
1089  off0 = 0;
1090  break;
1091 
1092  case HTS_IDX_NONE:
1093  finished0 = 1;
1094  off0 = 0;
1095  break;
1096 
1097  default:
1098  return 0;
1099  }
1100  if (off0 != (uint64_t)-1) {
1101  iter = (hts_itr_t*)calloc(1, sizeof(hts_itr_t));
1102  iter->read_rest = 1;
1103  iter->finished = finished0;
1104  iter->curr_off = off0;
1105  iter->readrec = readrec;
1106  return iter;
1107  } else return 0;
1108  }
1109 
1110  if (beg < 0) beg = 0;
1111  if (end < beg) return 0;
1112  if ((bidx = idx->bidx[tid]) == 0) return 0;
1113 
1114  iter = (hts_itr_t*)calloc(1, sizeof(hts_itr_t));
1115  iter->tid = tid, iter->beg = beg, iter->end = end; iter->i = -1;
1116  iter->readrec = readrec;
1117 
1118  // compute min_off
1119  bin = hts_bin_first(idx->n_lvls) + (beg>>idx->min_shift);
1120  do {
1121  int first;
1122  k = kh_get(bin, bidx, bin);
1123  if (k != kh_end(bidx)) break;
1124  first = (hts_bin_parent(bin)<<3) + 1;
1125  if (bin > first) --bin;
1126  else bin = hts_bin_parent(bin);
1127  } while (bin);
1128  if (bin == 0) k = kh_get(bin, bidx, bin);
1129  min_off = k != kh_end(bidx)? kh_val(bidx, k).loff : 0;
1130  // retrieve bins
1131  reg2bins(beg, end, iter, idx->min_shift, idx->n_lvls);
1132  for (i = n_off = 0; i < iter->bins.n; ++i)
1133  if ((k = kh_get(bin, bidx, iter->bins.a[i])) != kh_end(bidx))
1134  n_off += kh_value(bidx, k).n;
1135  if (n_off == 0) return iter;
1136  off = (hts_pair64_t*)calloc(n_off, 16);
1137  for (i = n_off = 0; i < iter->bins.n; ++i) {
1138  if ((k = kh_get(bin, bidx, iter->bins.a[i])) != kh_end(bidx)) {
1139  int j;
1140  bins_t *p = &kh_value(bidx, k);
1141  for (j = 0; j < p->n; ++j)
1142  if (p->list[j].v > min_off) off[n_off++] = p->list[j];
1143  }
1144  }
1145  if (n_off == 0) {
1146  free(off); return iter;
1147  }
1148  ks_introsort(_off, n_off, off);
1149  // resolve completely contained adjacent blocks
1150  for (i = 1, l = 0; i < n_off; ++i)
1151  if (off[l].v < off[i].v) off[++l] = off[i];
1152  n_off = l + 1;
1153  // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
1154  for (i = 1; i < n_off; ++i)
1155  if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u;
1156  // merge adjacent blocks
1157  for (i = 1, l = 0; i < n_off; ++i) {
1158  if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v;
1159  else off[++l] = off[i];
1160  }
1161  n_off = l + 1;
1162  iter->n_off = n_off; iter->off = off;
1163  return iter;
1164 }
1165 
1167 {
1168  if (iter) { free(iter->off); free(iter->bins.a); free(iter); }
1169 }
1170 
1171 const char *hts_parse_reg(const char *s, int *beg, int *end)
1172 {
1173  int i, k, l, name_end;
1174  *beg = *end = -1;
1175  name_end = l = strlen(s);
1176  // determine the sequence name
1177  for (i = l - 1; i >= 0; --i) if (s[i] == ':') break; // look for colon from the end
1178  if (i >= 0) name_end = i;
1179  if (name_end < l) { // check if this is really the end
1180  int n_hyphen = 0;
1181  for (i = name_end + 1; i < l; ++i) {
1182  if (s[i] == '-') ++n_hyphen;
1183  else if (!isdigit(s[i]) && s[i] != ',') break;
1184  }
1185  if (i < l || n_hyphen > 1) name_end = l; // malformated region string; then take str as the name
1186  }
1187  // parse the interval
1188  if (name_end < l) {
1189  char *tmp;
1190  tmp = (char*)alloca(l - name_end + 1);
1191  for (i = name_end + 1, k = 0; i < l; ++i)
1192  if (s[i] != ',') tmp[k++] = s[i];
1193  tmp[k] = 0;
1194  if ((*beg = strtol(tmp, &tmp, 10) - 1) < 0) *beg = 0;
1195  *end = *tmp? strtol(tmp + 1, &tmp, 10) : 1<<29;
1196  if (*beg > *end) name_end = l;
1197  }
1198  if (name_end == l) *beg = 0, *end = 1<<29;
1199  return s + name_end;
1200 }
1201 
1202 hts_itr_t *hts_itr_querys(const hts_idx_t *idx, const char *reg, hts_name2id_f getid, void *hdr, hts_itr_query_func *itr_query, hts_readrec_func *readrec)
1203 {
1204  int tid, beg, end;
1205  char *q, *tmp;
1206  if (strcmp(reg, ".") == 0)
1207  return itr_query(idx, HTS_IDX_START, 0, 1<<29, readrec);
1208  else if (strcmp(reg, "*") != 0) {
1209  q = (char*)hts_parse_reg(reg, &beg, &end);
1210  tmp = (char*)alloca(q - reg + 1);
1211  strncpy(tmp, reg, q - reg);
1212  tmp[q - reg] = 0;
1213  if ((tid = getid(hdr, tmp)) < 0)
1214  tid = getid(hdr, reg);
1215  if (tid < 0) return 0;
1216  return itr_query(idx, tid, beg, end, readrec);
1217  } else return itr_query(idx, HTS_IDX_NOCOOR, 0, 0, readrec);
1218 }
1219 
1220 int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data)
1221 {
1222  int ret, tid, beg, end;
1223  if (iter == NULL || iter->finished) return -1;
1224  if (iter->read_rest) {
1225  if (iter->curr_off) { // seek to the start
1226  bgzf_seek(fp, iter->curr_off, SEEK_SET);
1227  iter->curr_off = 0; // only seek once
1228  }
1229  ret = iter->readrec(fp, data, r, &tid, &beg, &end);
1230  if (ret < 0) iter->finished = 1;
1231  return ret;
1232  }
1233  if (iter->off == 0) return -1;
1234  for (;;) {
1235  if (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->i].v) { // then jump to the next chunk
1236  if (iter->i == iter->n_off - 1) { ret = -1; break; } // no more chunks
1237  if (iter->i < 0 || iter->off[iter->i].v != iter->off[iter->i+1].u) { // not adjacent chunks; then seek
1238  bgzf_seek(fp, iter->off[iter->i+1].u, SEEK_SET);
1239  iter->curr_off = bgzf_tell(fp);
1240  }
1241  ++iter->i;
1242  }
1243  if ((ret = iter->readrec(fp, data, r, &tid, &beg, &end)) >= 0) {
1244  iter->curr_off = bgzf_tell(fp);
1245  if (tid != iter->tid || beg >= iter->end) { // no need to proceed
1246  ret = -1; break;
1247  } else if (end > iter->beg && iter->end > beg) return ret;
1248  } else break; // end of file or error
1249  }
1250  iter->finished = 1;
1251  return ret;
1252 }
1253 
1254 /**********************
1255  *** Retrieve index ***
1256  **********************/
1257 
1258 static char *test_and_fetch(const char *fn)
1259 {
1260  FILE *fp;
1261  // FIXME Use is_remote_scheme() helper that's true for ftp/http/irods/etc
1262  if (strstr(fn, "ftp://") == fn || strstr(fn, "http://") == fn) {
1263  const int buf_size = 1 * 1024 * 1024;
1264  hFILE *fp_remote;
1265  uint8_t *buf;
1266  int l;
1267  const char *p;
1268  for (p = fn + strlen(fn) - 1; p >= fn; --p)
1269  if (*p == '/') break;
1270  ++p; // p now points to the local file name
1271  if ((fp_remote = hopen(fn, "r")) == 0) {
1272  if (hts_verbose >= 1) fprintf(stderr, "[E::%s] fail to open remote file '%s'\n", __func__, fn);
1273  return 0;
1274  }
1275  if ((fp = fopen(p, "w")) == 0) {
1276  if (hts_verbose >= 1) fprintf(stderr, "[E::%s] fail to create file '%s' in the working directory\n", __func__, p);
1277  hclose_abruptly(fp_remote);
1278  return 0;
1279  }
1280  if (hts_verbose >= 3) fprintf(stderr, "[M::%s] downloading file '%s' to local directory\n", __func__, fn);
1281  buf = (uint8_t*)calloc(buf_size, 1);
1282  while ((l = hread(fp_remote, buf, buf_size)) > 0) fwrite(buf, 1, l, fp);
1283  free(buf);
1284  fclose(fp);
1285  if (hclose(fp_remote) != 0) fprintf(stderr, "[E::%s] fail to close remote file '%s'\n", __func__, fn);
1286  return (char*)p;
1287  } else {
1288  if ((fp = fopen(fn, "rb")) == 0) return 0;
1289  fclose(fp);
1290  return (char*)fn;
1291  }
1292 }
1293 
1294 char *hts_idx_getfn(const char *fn, const char *ext)
1295 {
1296  int i, l_fn, l_ext;
1297  char *fnidx, *ret;
1298  l_fn = strlen(fn); l_ext = strlen(ext);
1299  fnidx = (char*)calloc(l_fn + l_ext + 1, 1);
1300  strcpy(fnidx, fn); strcpy(fnidx + l_fn, ext);
1301  if ((ret = test_and_fetch(fnidx)) == 0) {
1302  for (i = l_fn - 1; i > 0; --i)
1303  if (fnidx[i] == '.') break;
1304  strcpy(fnidx + i, ext);
1305  ret = test_and_fetch(fnidx);
1306  }
1307  if (ret == 0) {
1308  free(fnidx);
1309  return 0;
1310  }
1311  l_fn = strlen(ret);
1312  memmove(fnidx, ret, l_fn + 1);
1313  return fnidx;
1314 }
1315 
1316 hts_idx_t *hts_idx_load(const char *fn, int fmt)
1317 {
1318  char *fnidx;
1319  hts_idx_t *idx;
1320  fnidx = hts_idx_getfn(fn, ".csi");
1321  if (fnidx) fmt = HTS_FMT_CSI;
1322  else fnidx = hts_idx_getfn(fn, fmt == HTS_FMT_BAI? ".bai" : ".tbi");
1323  if (fnidx == 0) return 0;
1324 
1325  // Check that the index file is up to date, the main file might have changed
1326  struct stat stat_idx,stat_main;
1327  if ( !stat(fn, &stat_main) && !stat(fnidx, &stat_idx) )
1328  {
1329  if ( stat_idx.st_mtime < stat_main.st_mtime )
1330  fprintf(stderr, "Warning: The index file is older than the data file: %s\n", fnidx);
1331  }
1332  idx = hts_idx_load_local(fnidx, fmt);
1333  free(fnidx);
1334  return idx;
1335 }