35 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
36 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
37 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
38 1, 2, 4, 8, 15,15,15,15, 15,15,15,15, 15, 0 ,15,15,
39 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
40 15,15, 5, 6, 8,15, 7, 9, 15,10,15,15, 15,15,15,15,
41 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
42 15,15, 5, 6, 8,15, 7, 9, 15,10,15,15, 15,15,15,15,
44 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
45 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
46 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
47 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
48 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
49 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
50 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
51 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15
62 static size_t decompress_peek(
hFILE *fp,
unsigned char *dest,
size_t destsize)
67 unsigned char buffer[512];
69 ssize_t npeek =
hpeek(fp, buffer,
sizeof buffer);
71 if (npeek < 0)
return 0;
92 static int is_binary(
unsigned char *s,
size_t n)
95 for (i = 0; i < n; i++)
96 if (s[i] < 0x07 || (s[i] >= 0x0e && s[i] < 0x20))
return 1;
104 if (hfile == NULL)
goto error;
107 if (fp == NULL)
goto error;
110 fp->
is_be = ed_is_big();
112 if (strchr(mode,
'r')) {
114 if (
hpeek(hfile, s, 6) == 6 && memcmp(s,
"CRAM", 4) == 0 &&
115 s[4] >= 1 && s[4] <= 2 && s[5] <= 1) {
118 else if (
hpeek(hfile, s, 18) == 18 && s[0] == 0x1f && s[1] == 0x8b &&
119 (s[3] & 4) && memcmp(&s[12],
"BC\2\0", 4) == 0) {
125 if (is_binary(s, decompress_peek(hfile, s, 4))) fp->
is_bin = 1;
128 else if (
hpeek(hfile, s, 2) == 2 && s[0] == 0x1f && s[1] == 0x8b) {
133 else if (
hpeek(hfile, s, 4) == 4 && is_binary(s, 4)) {
141 else if (strchr(mode,
'w') || strchr(mode,
'a')) {
143 if (strchr(mode,
'b')) fp->
is_bin = 1;
144 if (strchr(mode,
'c')) fp->
is_cram = 1;
153 if (fp->
fp.
bgzf == NULL)
goto error;
157 if (fp->
fp.
cram == NULL)
goto error;
170 if (gzfp) fp->
fp.
voidp = ks_init(gzfp);
181 fprintf(stderr,
"[E::%s] fail to open file '%s'\n", __func__, fn);
204 fprintf(stderr,
"[E::%s] Failed to decode sequence.\n", __func__);
207 fprintf(stderr,
"[W::%s] EOF marker is absent. The input is probably truncated.\n", __func__);
222 ks_destroy((kstream_t*)fp->
fp.
voidp);
249 fp->
fn_aux = strdup(fn_aux);
250 if (fp->
fn_aux == NULL)
return -1;
264 return ((kstream_t*)fp->
fp.
voidp)->f;
273 ((kstream_t*)fp->
fp.
voidp)->seek_pos = uoffset;
282 return ((kstream_t*)fp->
fp.
voidp)->seek_pos;
288 ret = ks_getuntil((kstream_t*)fp->
fp.
voidp, delimiter, str, &dret);
295 int m = 0, n = 0, dret;
304 if ( !fp )
return NULL;
308 str.
s = 0; str.
l = str.
m = 0;
310 while (ks_getuntil(ks,
KS_SEP_LINE, &str, &dret) >= 0)
312 if (str.
l == 0)
continue;
315 s[n-1] = strdup(str.
s);
327 const char *q = string, *p = string;
330 if (*p ==
',' || *p == 0)
334 s[n-1] = (
char*)calloc(p - q + 1, 1);
335 strncpy(s[n-1], q, p - q);
342 s = (
char**)realloc(s, n *
sizeof(
char*));
349 int m = 0, n = 0, dret;
359 str.
s = 0; str.
l = str.
m = 0;
361 while (ks_getuntil(ks,
KS_SEP_LINE, &str, &dret) >= 0) {
362 if (str.
l == 0)
continue;
365 s = (
char**)realloc(s, m *
sizeof(
char*));
367 s[n++] = strdup(str.
s);
375 s = (
char**)realloc(s, n *
sizeof(
char*));
377 }
else if (*fn ==
':') {
379 for (q = p = fn + 1;; ++p)
380 if (*p ==
',' || *p == 0) {
383 s = (
char**)realloc(s, m *
sizeof(
char*));
385 s[n] = (
char*)calloc(p - q + 1, 1);
386 strncpy(s[n++], q, p - q);
391 s = (
char**)realloc(s, n *
sizeof(
char*));
398 int len = strlen(fname);
399 if ( !strcasecmp(
".vcf.gz",fname+len-7) )
return FT_VCF_GZ;
400 if ( !strcasecmp(
".vcf",fname+len-4) )
return FT_VCF;
401 if ( !strcasecmp(
".bcf",fname+len-4) )
return FT_BCF_GZ;
402 if ( !strcmp(
"-",fname) )
return FT_STDIN;
405 int fd = open(fname, O_RDONLY);
409 if (
read(fd,magic,2)!=2 ) {
close(fd);
return 0; }
410 if ( !strncmp((
char*)magic,
"##",2) ) {
close(fd);
return FT_VCF; }
411 if ( !strncmp((
char*)magic,
"BCF",3) ) {
close(fd);
return FT_BCF; }
414 if ( magic[0]==0x1f && magic[1]==0x8b )
420 if ( !strncmp((
char*)magic,
"##",2) )
return FT_VCF;
421 if ( !strncmp((
char*)magic,
"BCF",3) )
return FT_BCF_GZ;
430 #define HTS_MIN_MARKER_DIST 0x10000
434 #define META_BIN(idx) ((idx)->n_bins + 1)
436 #define pair64_lt(a,b) ((a).u < (b).u)
473 static inline void insert_to_b(bidx_t *b,
int bin,
uint64_t beg,
uint64_t end)
478 k =
kh_put(bin, b, bin, &absent);
495 beg = _beg >> min_shift;
496 end = (_end - 1) >> min_shift;
497 if (l->m < end + 1) {
501 l->offset = (
uint64_t*)realloc(l->offset, l->m * 8);
502 memset(l->offset + old_m, 0xff, 8 * (l->m - old_m));
505 if (l->offset[beg] == (
uint64_t)-1) l->offset[beg] = offset;
507 for (i = beg; i <= end; ++i)
508 if (l->offset[i] == (
uint64_t)-1) l->offset[i] = offset;
510 if (l->n < end + 1) l->n = end + 1;
517 if (idx == NULL)
return NULL;
521 idx->
n_bins = ((1<<(3 * n_lvls + 3)) - 1) / 7;
527 idx->
bidx = (bidx_t**)calloc(n,
sizeof(bidx_t*));
528 if (idx->
bidx == NULL) { free(idx);
return NULL; }
530 if (idx->
lidx == NULL) { free(idx->
bidx); free(idx);
return NULL; }
535 static void update_loff(
hts_idx_t *idx,
int i,
int free_lidx)
537 bidx_t *bidx = idx->
bidx[i];
545 offset0 =
kh_val(bidx, k).list[0].u;
546 for (l = 0; l < lidx->n && lidx->offset[l] == (
uint64_t)-1; ++l)
547 lidx->offset[l] = offset0;
549 for (; l < lidx->n; ++l)
550 if (lidx->offset[l] == (
uint64_t)-1)
551 lidx->offset[l] = lidx->offset[l-1];
552 if (bidx == 0)
return;
558 int bot_bin = hts_bin_bot(
kh_key(bidx, k), idx->
n_lvls);
560 kh_val(bidx, k).loff = bot_bin < lidx->n ? lidx->offset[bot_bin] : 0;
567 lidx->m = lidx->n = 0;
572 static void compress_binning(
hts_idx_t *idx,
int i)
574 bidx_t *bidx = idx->
bidx[i];
577 if (bidx == 0)
return;
579 for (l = idx->
n_lvls; l > 0; --l) {
589 if (kp ==
kh_end(bidx))
continue;
591 if (q->
n + p->
n > q->
m) {
610 for (l = 1, m = 0; l < p->
n; ++l) {
622 if (idx == NULL || idx->
z.
finished)
return;
628 for (i = 0; i < idx->
n; ++i) {
630 compress_binning(idx, i);
640 idx->
m = idx->
m? idx->
m<<1 : 2;
641 idx->
bidx = (bidx_t**)realloc(idx->
bidx, idx->
m *
sizeof(bidx_t*));
643 memset(&idx->
bidx[oldm], 0, (idx->
m - oldm) *
sizeof(bidx_t*));
644 memset(&idx->
lidx[oldm], 0, (idx->
m - oldm) *
sizeof(
lidx_t));
646 if (idx->
n < tid + 1) idx->
n = tid + 1;
651 if (
hts_verbose >= 1) fprintf(stderr,
"[E::%s] NO_COOR reads not in a single block at the end %d %d\n", __func__, tid,idx->
z.
last_tid);
654 if (tid>=0 && idx->
bidx[tid] != 0)
656 if (
hts_verbose >= 1) fprintf(stderr,
"[E::%s] chromosome blocks not continuous\n", __func__);
661 }
else if (tid >= 0 && idx->
z.
last_coor > beg) {
662 if (
hts_verbose >= 1) fprintf(stderr,
"[E::%s] unsorted positions\n", __func__);
702 if (idx == 0)
return;
706 for (i = 0; i < idx->
m; ++i) {
707 bidx_t *bidx = idx->
bidx[i];
708 free(idx->
lidx[i].offset);
709 if (bidx == 0)
continue;
719 static inline long idx_read(
int is_bgzf,
void *fp,
void *buf,
long l)
722 else return (
long)fread(buf, 1, l, (FILE*)fp);
725 static inline long idx_write(
int is_bgzf,
void *fp,
const void *buf,
long l)
728 else return (
long)fwrite(buf, 1, l, (FILE*)fp);
731 static inline void swap_bins(
bins_t *p)
734 for (i = 0; i < p->
n; ++i) {
735 ed_swap_8p(&p->
list[i].
u);
736 ed_swap_8p(&p->
list[i].
v);
740 static void hts_idx_save_core(
const hts_idx_t *idx,
void *fp,
int fmt)
747 idx_write(is_bgzf, fp, ed_swap_4p(&x), 4);
748 }
else idx_write(is_bgzf, fp, &idx->
n, 4);
750 for (i = 0; i < idx->
n; ++i) {
752 bidx_t *bidx = idx->
bidx[i];
755 size = bidx?
kh_size(bidx) : 0;
758 idx_write(is_bgzf, fp, ed_swap_4p(&x), 4);
759 }
else idx_write(is_bgzf, fp, &size, 4);
760 if (bidx == 0)
goto write_lidx;
767 x =
kh_key(bidx, k); idx_write(is_bgzf, fp, ed_swap_4p(&x), 4);
770 idx_write(is_bgzf, fp, ed_swap_4p(&y), 8);
772 x = p->
n; idx_write(is_bgzf, fp, ed_swap_4p(&x), 4);
774 idx_write(is_bgzf, fp, p->
list, 16 * p->
n);
777 idx_write(is_bgzf, fp, &
kh_key(bidx, k), 4);
780 idx_write(is_bgzf, fp, &p->
n, 4);
781 idx_write(is_bgzf, fp, p->
list, p->
n << 4);
788 idx_write(is_bgzf, fp, ed_swap_4p(&x), 4);
789 for (x = 0; x < lidx->n; ++x) ed_swap_8p(&lidx->offset[x]);
790 idx_write(is_bgzf, fp, lidx->offset, lidx->n << 3);
791 for (x = 0; x < lidx->n; ++x) ed_swap_8p(&lidx->offset[x]);
793 idx_write(is_bgzf, fp, &lidx->n, 4);
794 idx_write(is_bgzf, fp, lidx->offset, lidx->n << 3);
800 idx_write(is_bgzf, fp, &x, 8);
801 }
else idx_write(is_bgzf, fp, &idx->
n_no_coor, 8);
807 fnidx = (
char*)calloc(1, strlen(fn) + 5);
814 fp =
bgzf_open(strcat(fnidx,
".csi"),
"w");
818 for (i = 0; i < 3; ++i)
826 fp =
bgzf_open(strcat(fnidx,
".tbi"),
"w");
832 fp = fopen(strcat(fnidx,
".bai"),
"w");
833 fwrite(
"BAI\1", 1, 4, fp);
840 static int hts_idx_load_core(
hts_idx_t *idx,
void *fp,
int fmt)
845 if (idx == NULL)
return -4;
846 for (i = 0; i < idx->
n; ++i) {
853 if (idx_read(is_bgzf, fp, &n, 4) != 4)
return -1;
854 if (is_be) ed_swap_4p(&n);
855 for (j = 0; j < n; ++j) {
857 if (idx_read(is_bgzf, fp, &key, 4) != 4)
return -1;
858 if (is_be) ed_swap_4p(&key);
859 k =
kh_put(bin, h, key, &absent);
860 if (absent <= 0)
return -3;
863 if (idx_read(is_bgzf, fp, &p->
loff, 8) != 8)
return -1;
864 if (is_be) ed_swap_8p(&p->
loff);
866 if (idx_read(is_bgzf, fp, &p->
n, 4) != 4)
return -1;
867 if (is_be) ed_swap_4p(&p->
n);
870 if (p->
list == NULL)
return -2;
871 if (idx_read(is_bgzf, fp, p->
list, p->
n<<4) != p->
n<<4)
return -1;
872 if (is_be) swap_bins(p);
876 if (idx_read(is_bgzf, fp, &l->n, 4) != 4)
return -1;
877 if (is_be) ed_swap_4p(&l->n);
879 l->offset = (
uint64_t*)malloc(l->n << 3);
880 if (l->offset == NULL)
return -2;
881 if (idx_read(is_bgzf, fp, l->offset, l->n << 3) != l->n << 3)
return -1;
882 if (is_be)
for (j = 0; j < l->n; ++j) ed_swap_8p(&l->offset[j]);
883 for (j = 1; j < l->n; ++j)
884 if (l->offset[j] == 0) l->offset[j] = l->offset[j-1];
885 update_loff(idx, i, 1);
903 if ((fp =
bgzf_open(fn,
"r")) == 0)
return NULL;
904 if (
bgzf_read(fp, magic, 4) != 4)
goto csi_fail;
905 if (memcmp(magic,
"CSI\1", 4) != 0)
goto csi_fail;
906 if (
bgzf_read(fp, x, 12) != 12)
goto csi_fail;
907 if (is_be)
for (i = 0; i < 3; ++i) ed_swap_4p(&x[i]);
909 if ((meta = (
uint8_t*)malloc(x[2])) == NULL)
goto csi_fail;
910 if (
bgzf_read(fp, meta, x[2]) != x[2])
goto csi_fail;
912 if (
bgzf_read(fp, &n, 4) != 4)
goto csi_fail;
913 if (is_be) ed_swap_4p(&n);
914 if ((idx =
hts_idx_init(n, fmt, 0, x[0], x[1])) == NULL)
goto csi_fail;
918 if (hts_idx_load_core(idx, fp,
HTS_FMT_CSI) < 0)
goto csi_fail;
931 if ((fp =
bgzf_open(fn,
"r")) == 0)
return NULL;
932 if (
bgzf_read(fp, magic, 4) != 4)
goto tbi_fail;
933 if (memcmp(magic,
"TBI\1", 4) != 0)
goto tbi_fail;
934 if (
bgzf_read(fp, x, 32) != 32)
goto tbi_fail;
935 if (is_be)
for (i = 0; i < 8; ++i) ed_swap_4p(&x[i]);
936 if ((idx =
hts_idx_init(x[0], fmt, 0, 14, 5)) == NULL)
goto tbi_fail;
939 memcpy(idx->
meta, &x[1], 28);
940 if (
bgzf_read(fp, idx->
meta + 28, x[7]) != x[7])
goto tbi_fail;
941 if (hts_idx_load_core(idx, fp,
HTS_FMT_TBI) < 0)
goto tbi_fail;
953 if ((fp = fopen(fn,
"rb")) == 0)
return NULL;
954 if (fread(magic, 1, 4, fp) != 4)
goto bai_fail;
955 if (memcmp(magic,
"BAI\1", 4) != 0)
goto bai_fail;
956 if (fread(&n, 4, 1, fp) != 1)
goto bai_fail;
957 if (is_be) ed_swap_4p(&n);
959 if (hts_idx_load_core(idx, fp,
HTS_FMT_BAI) < 0)
goto bai_fail;
977 memcpy(idx->
meta, meta, l_meta);
978 }
else idx->
meta = meta;
996 const char **names = (
const char**) calloc(idx->
n,
sizeof(
const char*));
997 for (i=0; i<idx->
n; i++)
999 bidx_t *bidx = idx->
bidx[i];
1000 if ( !bidx )
continue;
1001 names[tid++] = getid(hdr,i);
1010 *mapped = 0; *unmapped = 0;
1014 bidx_t *h = idx->
bidx[tid];
1017 *mapped =
kh_val(h, k).list[1].u;
1018 *unmapped =
kh_val(h, k).list[1].v;
1021 *mapped = 0; *unmapped = 0;
1037 int l, t, s = min_shift + (n_lvls<<1) + n_lvls;
1038 if (beg >= end)
return 0;
1039 if (end >= 1LL<<s) end = 1LL<<s;
1040 for (--end, l = 0, t = 0; l <= n_lvls; s -= 3, t += 1<<((l<<1)+l), ++l) {
1042 b = t + (beg>>s); e = t + (end>>s); n = e - b + 1;
1048 for (i = b; i <= e; ++i) itr->
bins.
a[itr->
bins.
n++] = i;
1055 int i, n_off, l, bin;
1068 for (i=0; i<idx->
n; i++)
1070 bidx = idx->
bidx[i];
1072 if (k ==
kh_end(bidx))
continue;
1073 if ( off0 >
kh_val(bidx, k).list[0].u ) off0 =
kh_val(bidx, k).list[0].u;
1081 bidx = idx->
bidx[idx->
n - 1];
1083 if (k !=
kh_end(bidx)) off0 =
kh_val(bidx, k).list[0].v;
1110 if (beg < 0) beg = 0;
1111 if (end < beg)
return 0;
1112 if ((bidx = idx->
bidx[tid]) == 0)
return 0;
1115 iter->
tid = tid, iter->
beg = beg, iter->
end = end; iter->
i = -1;
1122 k =
kh_get(bin, bidx, bin);
1123 if (k !=
kh_end(bidx))
break;
1125 if (bin > first) --bin;
1128 if (bin == 0) k =
kh_get(bin, bidx, bin);
1132 for (i = n_off = 0; i < iter->
bins.
n; ++i)
1135 if (n_off == 0)
return iter;
1137 for (i = n_off = 0; i < iter->
bins.
n; ++i) {
1141 for (j = 0; j < p->
n; ++j)
1142 if (p->
list[j].
v > min_off) off[n_off++] = p->
list[j];
1146 free(off);
return iter;
1150 for (i = 1, l = 0; i < n_off; ++i)
1151 if (off[l].v < off[i].v) off[++l] = off[i];
1154 for (i = 1; i < n_off; ++i)
1155 if (off[i-1].v >= off[i].u) off[i-1].
v = off[i].
u;
1157 for (i = 1, l = 0; i < n_off; ++i) {
1158 if (off[l].v>>16 == off[i].u>>16) off[l].
v = off[i].
v;
1159 else off[++l] = off[i];
1162 iter->
n_off = n_off; iter->
off = off;
1168 if (iter) { free(iter->
off); free(iter->
bins.
a); free(iter); }
1173 int i, k, l, name_end;
1175 name_end = l = strlen(s);
1177 for (i = l - 1; i >= 0; --i)
if (s[i] ==
':')
break;
1178 if (i >= 0) name_end = i;
1181 for (i = name_end + 1; i < l; ++i) {
1182 if (s[i] ==
'-') ++n_hyphen;
1183 else if (!isdigit(s[i]) && s[i] !=
',')
break;
1185 if (i < l || n_hyphen > 1) name_end = l;
1190 tmp = (
char*)alloca(l - name_end + 1);
1191 for (i = name_end + 1, k = 0; i < l; ++i)
1192 if (s[i] !=
',') tmp[k++] = s[i];
1194 if ((*beg = strtol(tmp, &tmp, 10) - 1) < 0) *beg = 0;
1195 *end = *tmp? strtol(tmp + 1, &tmp, 10) : 1<<29;
1196 if (*beg > *end) name_end = l;
1198 if (name_end == l) *beg = 0, *end = 1<<29;
1199 return s + name_end;
1206 if (strcmp(reg,
".") == 0)
1208 else if (strcmp(reg,
"*") != 0) {
1210 tmp = (
char*)alloca(q - reg + 1);
1211 strncpy(tmp, reg, q - reg);
1213 if ((tid = getid(hdr, tmp)) < 0)
1214 tid = getid(hdr, reg);
1215 if (tid < 0)
return 0;
1216 return itr_query(idx, tid, beg, end, readrec);
1222 int ret, tid, beg, end;
1223 if (iter == NULL || iter->
finished)
return -1;
1229 ret = iter->
readrec(fp, data, r, &tid, &beg, &end);
1233 if (iter->
off == 0)
return -1;
1236 if (iter->
i == iter->
n_off - 1) { ret = -1;
break; }
1237 if (iter->
i < 0 || iter->
off[iter->
i].
v != iter->
off[iter->
i+1].
u) {
1243 if ((ret = iter->
readrec(fp, data, r, &tid, &beg, &end)) >= 0) {
1245 if (tid != iter->
tid || beg >= iter->
end) {
1247 }
else if (end > iter->
beg && iter->
end > beg)
return ret;
1258 static char *test_and_fetch(
const char *fn)
1262 if (strstr(fn,
"ftp://") == fn || strstr(fn,
"http://") == fn) {
1263 const int buf_size = 1 * 1024 * 1024;
1268 for (p = fn + strlen(fn) - 1; p >= fn; --p)
1269 if (*p ==
'/')
break;
1271 if ((fp_remote =
hopen(fn,
"r")) == 0) {
1272 if (
hts_verbose >= 1) fprintf(stderr,
"[E::%s] fail to open remote file '%s'\n", __func__, fn);
1275 if ((fp = fopen(p,
"w")) == 0) {
1276 if (
hts_verbose >= 1) fprintf(stderr,
"[E::%s] fail to create file '%s' in the working directory\n", __func__, p);
1280 if (
hts_verbose >= 3) fprintf(stderr,
"[M::%s] downloading file '%s' to local directory\n", __func__, fn);
1281 buf = (
uint8_t*)calloc(buf_size, 1);
1282 while ((l =
hread(fp_remote, buf, buf_size)) > 0) fwrite(buf, 1, l, fp);
1285 if (
hclose(fp_remote) != 0) fprintf(stderr,
"[E::%s] fail to close remote file '%s'\n", __func__, fn);
1288 if ((fp = fopen(fn,
"rb")) == 0)
return 0;
1298 l_fn = strlen(fn); l_ext = strlen(ext);
1299 fnidx = (
char*)calloc(l_fn + l_ext + 1, 1);
1300 strcpy(fnidx, fn); strcpy(fnidx + l_fn, ext);
1301 if ((ret = test_and_fetch(fnidx)) == 0) {
1302 for (i = l_fn - 1; i > 0; --i)
1303 if (fnidx[i] ==
'.')
break;
1304 strcpy(fnidx + i, ext);
1305 ret = test_and_fetch(fnidx);
1312 memmove(fnidx, ret, l_fn + 1);
1323 if (fnidx == 0)
return 0;
1326 struct stat stat_idx,stat_main;
1327 if ( !stat(fn, &stat_main) && !stat(fnidx, &stat_idx) )
1329 if ( stat_idx.st_mtime < stat_main.st_mtime )
1330 fprintf(stderr,
"Warning: The index file is older than the data file: %s\n", fnidx);