18 #include "htslib/kseq.h"
21 uint32_t bcf_float_missing = 0x7F800001;
22 uint32_t bcf_float_vector_end = 0x7F800002;
23 uint8_t bcf_type_shift[] = { 0, 0, 1, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
24 static bcf_idinfo_t bcf_idinfo_def = { .
info = { 15, 15, 15 }, .hrec = { NULL, NULL, NULL}, .id = -1 };
41 while ( !*ss && isspace(*ss) ) ss++;
44 fprintf(stderr,
"[W::%s] Empty sample name: trailing spaces/tabs in the header line?\n", __func__);
50 char *sdup = strdup(s);
51 int k =
kh_put(vdict, d, sdup, &ret);
53 kh_val(d, k) = bcf_idinfo_def;
57 fprintf(stderr,
"[W::%s] Duplicated sample name '%s'. Skipped.\n", __func__, s);
72 for (p = q = str;; ++q) {
73 if (*q !=
'\t' && *q != 0 && *q !=
'\n')
continue;
75 char *s = (
char*)malloc(q - p + 1);
81 if (*q == 0 || *q ==
'\n')
break;
90 for (i = 0; i < 3; i++)
92 vdict_t *d = (vdict_t*)h->
dict[i];
102 if ( max_id >= h->
n[i] )
105 for (k=h->
n[i]; k<=max_id; k++)
107 h->
id[i][k].
key = NULL;
108 h->
id[i][k].
val = NULL;
127 for (i=0; i<hrec->
nkeys; i++)
142 if ( hrec->
key ) out->
key = strdup(hrec->
key);
145 out->
keys = (
char**) malloc(
sizeof(
char*)*hrec->
nkeys);
146 out->
vals = (
char**) malloc(
sizeof(
char*)*hrec->
nkeys);
148 for (i=0; i<hrec->
nkeys; i++)
150 if ( hrec->
keys[i] && !strcmp(
"IDX",hrec->
keys[i]) )
continue;
151 if ( hrec->
keys[i] ) out->
keys[j] = strdup(hrec->
keys[i]);
152 if ( hrec->
vals[i] ) out->
vals[j] = strdup(hrec->
vals[i]);
155 if ( i!=j ) out->
nkeys--;
161 fprintf(fp,
"key=[%s] value=[%s]", hrec->
key, hrec->
value?hrec->
value:
"");
163 for (i=0; i<hrec->
nkeys; i++)
164 fprintf(fp,
"\t[%s]=[%s]", hrec->
keys[i],hrec->
vals[i]);
171 for (i=0; i<hdr->
nhrec; i++)
175 fprintf(stderr,
"##%s=<", hdr->
hrec[i]->
key);
179 fprintf(stderr,
">\n");
188 int n = ++hrec->
nkeys;
189 hrec->
keys = (
char**) realloc(hrec->
keys,
sizeof(
char*)*n);
190 hrec->
vals = (
char**) realloc(hrec->
vals,
sizeof(
char*)*n);
192 hrec->
keys[n-1] = (
char*) malloc((len+1)*
sizeof(char));
193 memcpy(hrec->
keys[n-1],str,len);
194 hrec->
keys[n-1][len] = 0;
195 hrec->
vals[n-1] = NULL;
200 if ( !str ) { hrec->
vals[i] = NULL;
return; }
201 if ( hrec->
vals[i] ) free(hrec->
vals[i]);
204 hrec->
vals[i] = (
char*) malloc((len+3)*
sizeof(char));
205 hrec->
vals[i][0] =
'"';
206 memcpy(&hrec->
vals[i][1],str,len);
207 hrec->
vals[i][len+1] =
'"';
208 hrec->
vals[i][len+2] = 0;
212 hrec->
vals[i] = (
char*) malloc((len+1)*
sizeof(char));
213 memcpy(hrec->
vals[i],str,len);
214 hrec->
vals[i][len] = 0;
220 int n = ++hrec->
nkeys;
221 hrec->
keys = (
char**) realloc(hrec->
keys,
sizeof(
char*)*n);
222 hrec->
vals = (
char**) realloc(hrec->
vals,
sizeof(
char*)*n);
223 hrec->
keys[n-1] = strdup(
"IDX");
226 hrec->
vals[n-1] = str.
s;
232 for (i=0; i<hrec->
nkeys; i++)
233 if ( !strcasecmp(key,hrec->
keys[i]) )
return i;
237 static inline int is_escaped(
const char *
min,
const char *str)
240 while ( --str>=min && *str==
'\\' ) n++;
246 const char *p = line;
247 if (p[0] !=
'#' || p[1] !=
'#') { *len = 0;
return NULL; }
251 while ( *q && *q!=
'=' ) q++;
253 if ( *q!=
'=' || !n ) { *len = q-line+1;
return NULL; }
256 hrec->
key = (
char*) malloc(
sizeof(
char)*(n+1));
257 memcpy(hrec->
key,p,n);
263 while ( *q && *q!=
'\n' ) q++;
264 hrec->
value = (
char*) malloc((q-p+1)*
sizeof(char));
265 memcpy(hrec->
value, p, q-p);
266 hrec->
value[q-p] = 0;
273 while ( *q && *q!=
'\n' && nopen )
276 while ( *q && *q!=
'=' ) q++;
278 if ( *q!=
'=' || !n ) { *len = q-line+1;
bcf_hrec_destroy(hrec);
return NULL; }
281 int quoted = *p==
'"' ? 1 : 0;
282 if ( quoted ) p++, q++;
286 if ( quoted ) {
if ( *q==
'"' && !is_escaped(p,q) )
break; }
289 if ( *q==
'<' ) nopen++;
290 if ( *q==
'>' ) nopen--;
292 if ( *q==
',' && nopen==1 )
break;
298 if ( *q==
'>' ) { nopen--; q++; }
310 if ( !strcmp(hrec->
key,
"contig") )
317 if ( sscanf(hrec->
vals[i],
"%d",&j)!=1 )
return 0;
321 str = strdup(hrec->
vals[i]);
325 k =
kh_put(vdict, d, str, &ret);
326 if ( !ret ) { free(str);
return 0; }
331 char *tmp = hrec->
vals[idx];
332 idx = strtol(hrec->
vals[idx], &tmp, 10);
335 fprintf(stderr,
"[%s:%d %s] Error parsing the IDX tag, skipping.\n", __FILE__,__LINE__,__FUNCTION__);
345 kh_val(d, k) = bcf_idinfo_def;
348 kh_val(d, k).hrec[0] = hrec;
361 int type = -1, num = -1, var = -1, idx = -1;
362 for (i=0; i<hrec->
nkeys; i++)
364 if ( !strcmp(hrec->
keys[i],
"ID") )
id = hrec->
vals[i];
365 else if ( !strcmp(hrec->
keys[i],
"IDX") )
367 char *tmp = hrec->
vals[i];
368 idx = strtol(hrec->
vals[i], &tmp, 10);
371 fprintf(stderr,
"[%s:%d %s] Error parsing the IDX tag, skipping.\n", __FILE__,__LINE__,__FUNCTION__);
375 else if ( !strcmp(hrec->
keys[i],
"Type") )
383 fprintf(stderr,
"[E::%s] The type \"%s\" not supported, assuming \"String\"\n", __func__, hrec->
vals[i]);
387 else if ( !strcmp(hrec->
keys[i],
"Number") )
395 sscanf(hrec->
vals[i],
"%d",&num);
408 k =
kh_put(vdict, d, str, &ret);
413 if (
kh_val(d, k).hrec[info&0xf] )
return 0;
414 kh_val(d, k).info[info&0xf] = info;
415 kh_val(d, k).hrec[info&0xf] = hrec;
418 kh_val(d, k) = bcf_idinfo_def;
419 kh_val(d, k).info[info&0xf] = info;
420 kh_val(d, k).hrec[info&0xf] = hrec;
442 for (i=0; i<hdr->
nhrec; i++)
445 if ( !strcmp(hdr->
hrec[i]->
key,hrec->
key) && !strcmp(hrec->
key,
"fileformat") )
break;
456 int n = ++hdr->
nhrec;
458 hdr->
hrec[n-1] = hrec;
468 for (i=0; i<hdr->
nhrec; i++)
471 if ( !strcmp(hdr->
hrec[i]->
key,
id) )
return hdr->
hrec[i];
477 if ( k ==
kh_end(d) )
return NULL;
483 static int PL_warned = 0, GL_warned = 0;
490 fprintf(stderr,
"[W::%s] PL should be declared as Number=G\n", __func__);
499 fprintf(stderr,
"[W::%s] GL should be declared as Number=G\n", __func__);
507 int len, needs_sync = 0;
512 if ( !hrec->
key || strcasecmp(hrec->
key,
"fileformat") )
513 fprintf(stderr,
"[W::%s] The first line should be ##fileformat; is the VCF/BCF header broken?\n", __func__);
517 hrec =
bcf_hdr_parse_line(hdr,
"##FILTER=<ID=PASS,Description=\"All filters passed\">",&len);
536 if ( !hrec )
return -1;
553 for (i=0; i<hdr->
nhrec; i++)
554 if ( hdr->
hrec[i]==hrec )
break;
555 assert( i<hdr->nhrec );
563 for (i=0; i<hdr->
nhrec; i++)
565 if ( hdr->
hrec[i]->
type!=type )
continue;
566 if ( !strcmp(hdr->
hrec[i]->
key,key) )
break;
568 if ( i==hdr->
nhrec )
return;
573 if ( i < hdr->nhrec )
585 int n = vsnprintf(NULL, 0, fmt, ap) + 2;
588 char *line = (
char*)malloc(n);
590 vsnprintf(line, n, fmt, ap);
609 fprintf(stderr,
"No version string found, assuming VCFv4.2\n");
622 ksprintf(&str,
"##fileformat=%s", version);
629 hrec->
value = strdup(version);
639 for (i = 0; i < 3; ++i)
641 if ( strchr(mode,
'w') )
645 bcf_hdr_append(h,
"##FILTER=<ID=PASS,Description=\"All filters passed\">");
654 for (i = 0; i < 3; ++i) {
655 vdict_t *d = (vdict_t*)h->
dict[i];
656 if (d == 0)
continue;
662 for (i=0; i<h->
nhrec; i++)
683 fprintf(stderr,
"[%s:%d %s] Failed to read the header (reading BCF in text mode?)\n", __FILE__,__LINE__,__FUNCTION__);
686 if (strncmp((
char*)magic,
"BCF\2\2", 5) != 0)
688 if (!strncmp((
char*)magic,
"BCF", 3))
689 fprintf(stderr,
"[%s:%d %s] invalid BCF2 magic string: only BCFv2.2 is supported.\n", __FILE__,__LINE__,__FUNCTION__);
691 fprintf(stderr,
"[E::%s] invalid BCF2 magic string\n", __func__);
698 htxt = (
char*)malloc(hlen);
714 if (
bgzf_write(fp,
"BCF\2\2", 5) !=5 )
return -1;
715 if (
bgzf_write(fp, &hlen, 4) !=4 )
return -1;
716 if (
bgzf_write(fp, htxt, hlen) != hlen )
return -1;
744 for (i=0; i<v->
d.
m_fmt; i++)
781 static inline int bcf_read1_core(
BGZF *fp,
bcf1_t *v)
785 if ((ret =
bgzf_read(fp, x, 32)) != 32) {
786 if (ret == 0)
return -1;
791 ks_resize(&v->
shared, x[0]);
792 ks_resize(&v->
indiv, x[1]);
793 memcpy(v, x + 2, 16);
806 #define bit_array_size(n) ((n)/8+1)
807 #define bit_array_set(a,i) ((a)[(i)/8] |= 1 << ((i)%8))
808 #define bit_array_clear(a,i) ((a)[(i)/8] &= ~(1 << ((i)%8)))
809 #define bit_array_test(a,i) ((a)[(i)/8] & (1 << ((i)%8)))
825 for (i=0; i<dec->m_fmt; ++i) dec->fmt[i].p_free = 0;
829 ptr = bcf_unpack_fmt_core1(ptr, rec->
n_sample, &dec->fmt[i]);
830 src = dec->fmt[i].p - dec->fmt[i].size;
833 memmove(dec->fmt[i-1].p + dec->fmt[i-1].p_len, dec->fmt[i].p - dec->fmt[i].p_off, dec->fmt[i].p_off);
834 dec->fmt[i].p = dec->fmt[i-1].p + dec->fmt[i-1].p_len + dec->fmt[i].p_off;
839 src += dec->fmt[i].size;
841 memmove(dst, src, dec->fmt[i].size);
842 dst += dec->fmt[i].size;
844 rec->
indiv.
l -= dec->fmt[i].p_len - (dst - dec->fmt[i].p);
845 dec->fmt[i].p_len = dst - dec->fmt[i].p;
856 int ret = bcf_read1_core(fp->
fp.
bgzf, v);
865 if ((ret = bcf_read1_core(fp, v)) >= 0)
895 for (i=0; i<line->
n_info; i++)
901 if ( irm < 0 ) irm = i;
908 while ( irm<=i && line->d.info[irm].vptr ) irm++;
911 if ( irm>=0 ) line->
n_info = irm;
914 static int bcf1_sync(
bcf1_t *line)
916 char *shared_ori = line->
shared.
s;
924 bcf1_sync_id(line, &tmp);
927 bcf1_sync_alleles(line, &tmp);
930 bcf1_sync_filter(line, &tmp);
933 bcf1_sync_info(line, &tmp);
946 bcf1_sync_id(line, &tmp);
954 bcf1_sync_alleles(line, &tmp);
967 bcf1_sync_filter(line, &tmp);
980 bcf1_sync_info(line, &tmp);
986 int size = line->
shared.
l - (size_t)ptr_ori + (
size_t)line->
shared.
s;
987 if ( size ) kputsn_(ptr_ori, size, &tmp);
997 for (i=0; i<line->
n_info; i++)
1013 tmp.
l = tmp.
m = 0; tmp.
s = NULL;
1015 for (i=0; i<line->
n_fmt; i++)
1021 if ( irm < 0 ) irm = i;
1028 while ( irm<=i && line->d.fmt[irm].p ) irm++;
1032 if ( irm>=0 ) line->
n_fmt = irm;
1038 for (i=0; i<line->
n_fmt; i++)
1083 fprintf(stderr,
"[%s:%d %s] Broken VCF record, the number of columns at %s:%d does not match the number of samples (%d vs %d).\n",
1096 fprintf(stderr,
"[%s:%d %s] Unchecked error (%d), exiting.\n", __FILE__,__LINE__,__FUNCTION__,v->
errcode);
1105 memcpy(x + 2, v, 16);
1108 if (
bgzf_write(fp, x, 32) != 32 )
return -1;
1123 txt.
l = txt.
m = 0; txt.
s = 0;
1125 if (s->
l == 0)
continue;
1126 if (s->
s[0] !=
'#') {
1128 fprintf(stderr,
"[E::%s] no sample line\n", __func__);
1133 if (s->
s[1] !=
'#' && fp->
fn_aux) {
1138 tmp.
l = tmp.
m = 0; tmp.
s = 0;
1141 while (ks_getuntil(ks, 0, &tmp, &dret) >= 0) {
1143 kputs(
"##contig=<ID=", &txt); kputs(tmp.
s, &txt);
1144 ks_getuntil(ks, 0, &tmp, &dret);
1145 kputs(
",length=", &txt); kputw(atol(tmp.
s), &txt);
1146 kputsn(
">\n", 2, &txt);
1148 while ((c = ks_getc(ks)) !=
'\n' && c != -1);
1154 kputsn(s->
s, s->
l, &txt);
1156 if (s->
s[1] !=
'#')
break;
1160 fprintf(stderr,
"[%s:%d %s] Could not read the header\n", __FILE__,__LINE__,__FUNCTION__);
1169 int i, n, need_sync = 0;
1174 if ( hrec )
continue;
1176 hrec->
key = strdup(
"contig");
1197 if ( !lines )
return 1;
1198 for (i=0; i<n-1; i++)
1218 for (j=0; j<hrec->
nkeys; j++)
1221 if ( !is_bcf && !strcmp(
"IDX",hrec->
keys[j]) )
continue;
1222 if ( nout ) kputc(
',',str);
1234 _bcf_hrec_format(hrec,0,str);
1240 for (i=0; i<hdr->
nhrec; i++)
1241 _bcf_hrec_format(hdr->
hrec[i], is_bcf, &txt);
1243 ksprintf(&txt,
"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO");
1252 if ( len ) *len = txt.
l;
1260 const char **names = (
const char**) calloc(m,
sizeof(
const char*));
1267 names[tid] =
kh_key(d,k);
1270 for (tid=0; tid<m; tid++)
1280 while (hlen && htxt[hlen-1] == 0) --hlen;
1287 return ret<0 ? -1 : 0;
1299 else if (n == 1) bcf_enc_int1(s, a[0]);
1301 if (wsize <= 0) wsize = n;
1302 for (i = 0; i < n; ++i) {
1304 if (max < a[i]) max = a[i];
1305 if (min > a[i]) min = a[i];
1309 for (i = 0; i < n; ++i)
1312 else kputc(a[i], s);
1315 for (i = 0; i < n; ++i)
1321 kputsn((
char*)&x, 2, s);
1325 for (i = 0; i < n; ++i) {
1327 kputsn((
char*)&x, 4, s);
1336 kputsn((
char*)a, n << 2, s);
1354 char *p = (
char*)data;
1355 for (j = 0; j < n && *p; ++j, ++p)
1363 #define BRANCH(type_t, is_missing, is_vector_end, kprint) { \
1364 type_t *p = (type_t *) data; \
1365 for (j=0; j<n; j++) \
1367 if ( is_vector_end ) break; \
1368 if ( j ) kputc(',', s); \
1369 if ( is_missing ) kputc('.', s); \
1378 default: fprintf(stderr,
"todo: type %d\n", type); exit(1);
break;
1387 x = bcf_dec_size(ptr, &ptr, &type);
1403 static inline void align_mem(
kstring_t *s)
1407 int l = ((s->
l + 7)>>3<<3) - s->
l;
1408 kputsn((
char*)&zero, l, s);
1426 for (r = p, v->
n_fmt = 1; *r; ++r)
1427 if (*r ==
':') ++v->
n_fmt;
1428 char *end = s->
s + s->
l;
1431 fprintf(stderr,
"[%s:%d %s] Error: FORMAT column with no sample columns starting at %s:%d\n", __FILE__,__LINE__,__FUNCTION__,s->
s,v->
pos+1);
1437 for (j = 0, t =
kstrtok(p,
":", &aux1); t; t =
kstrtok(0, 0, &aux1), ++j) {
1441 fprintf(stderr,
"[W::%s] FORMAT '%s' is not defined in the header, assuming Type=String\n", __func__, t);
1444 ksprintf(&tmp,
"##FORMAT=<ID=%s,Number=1,Type=String,Description=\"Dummy\">", t);
1453 fmt[j].
is_gt = !strcmp(t,
"GT");
1457 int n_sample_ori = -1;
1468 while ( *r!=
'\t' && r<end ) r++;
1469 if ( *r==
'\t' ) { *r = 0; r++; }
1478 if ( *r ==
'\t' ) *r = 0;
1479 if ( *r ==
':' || !*r )
1481 if (fmt[j].max_m < m) fmt[j].
max_m = m;
1482 if (fmt[j].max_l < l - 1) fmt[j].
max_l = l - 1;
1483 if (fmt[j].is_gt && fmt[j].max_g < g) fmt[j].
max_g = g;
1488 else if ( *r==
',' ) m++;
1489 else if ( fmt[j].is_gt && (*r ==
'|' || *r ==
'/') ) g++;
1490 if ( r>=end )
break;
1499 for (j = 0; j < v->
n_fmt; ++j) {
1508 fprintf(stderr,
"[E::%s] the format type %d currently not supported\n", __func__, f->
y>>4&0xf);
1516 for (j = 0; j < v->
n_fmt; ++j)
1529 while ( *t && t<end ) t++;
1544 if (*t ==
'.') ++t, x[l++] = is_phased;
1545 else x[l++] = (strtol(t, &t, 10) + 1) << 1 | is_phased;
1546 #if THOROUGH_SANITY_CHECKS
1549 is_phased = (*t ==
'|');
1550 if (*t ==
':' || *t == 0)
break;
1552 if ( !l ) x[l++] = 0;
1555 char *x = (
char*)z->
buf + z->
size * m;
1556 for (r = t, l = 0; *t !=
':' && *t; ++t) x[l++] = *t;
1557 for (; l < z->
size; ++l) x[l] = 0;
1563 else x[l++] = strtol(t, &t, 10);
1564 if (*t ==
':' || *t == 0)
break;
1569 float *x = (
float*)(z->
buf + z->
size * m);
1572 else x[l++] = strtod(t, &t);
1573 if (*t ==
':' || *t == 0)
break;
1579 for (++j; j < v->
n_fmt; ++j) {
1587 char *x = (
char*)z->
buf + z->
size * m;
1588 if ( z->
size ) x[0] =
'.';
1589 for (l = 1; l < z->
size; ++l) x[l] = 0;
1596 float *x = (
float*)(z->
buf + z->
size * m);
1616 for (i = 0; i < v->
n_fmt; ++i) {
1618 bcf_enc_int1(str, z->
key);
1633 fprintf(stderr,
"[%s:%d %s] Number of columns at %s:%d does not match the number of samples (%d vs %d).\n",
1645 char *p, *q, *r, *t;
1653 for (p =
kstrtok(s->
s,
"\t", &aux), i = 0; p; p =
kstrtok(0, 0, &aux), ++i) {
1663 fprintf(stderr,
"[W::%s] contig '%s' is not defined in the header. (Quick workaround: index the file with tabix.)\n", __func__, p);
1666 ksprintf(&tmp,
"##contig=<ID=%s,length=2147483647>", p);
1674 }
else if (i == 1) {
1675 v->
pos = atoi(p) - 1;
1676 }
else if (i == 2) {
1679 }
else if (i == 3) {
1682 }
else if (i == 4) {
1683 if (strcmp(p,
".")) {
1684 for (r = t = p;; ++r) {
1685 if (*r ==
',' || *r == 0) {
1693 }
else if (i == 5) {
1694 if (strcmp(p,
".")) v->
qual = atof(p);
1697 }
else if (i == 6) {
1698 if (strcmp(p,
".")) {
1704 if (*(q-1) ==
';') *(q-1) = 0;
1705 for (r = p; *r; ++r)
1706 if (*r ==
';') ++n_flt;
1707 a = (
int32_t*)alloca(n_flt * 4);
1709 for (t =
kstrtok(p,
";", &aux1), i = 0; t; t =
kstrtok(0, 0, &aux1)) {
1716 fprintf(stderr,
"[W::%s] FILTER '%s' is not defined in the header\n", __func__, t);
1719 ksprintf(&tmp,
"##FILTER=<ID=%s,Description=\"Dummy\">", t);
1726 a[i++] =
kh_val(d, k).id;
1732 }
else if (i == 7) {
1736 if (strcmp(p,
".")) {
1737 if (*(q-1) ==
';') *(q-1) = 0;
1738 for (r = key = p;; ++r) {
1741 if (*r !=
';' && *r !=
'=' && *r != 0)
continue;
1746 for (end = val; *end !=
';' && *end != 0; ++end);
1749 k =
kh_get(vdict, d, key);
1752 fprintf(stderr,
"[W::%s] INFO '%s' is not defined in the header, assuming Type=String\n", __func__, key);
1755 ksprintf(&tmp,
"##INFO=<ID=%s,Number=1,Type=String,Description=\"Dummy\">", key);
1759 k =
kh_get(vdict, d, key);
1764 bcf_enc_int1(str,
kh_val(d, k).
id);
1772 for (t = val, n_val = 1; *t; ++t)
1773 if (*t ==
',') ++n_val;
1776 z = (
int32_t*)alloca(n_val<<2);
1777 for (i = 0, t = val; i < n_val; ++i, ++t)
1779 z[i] = strtol(t, &te, 10);
1783 while ( *te && *te!=
',' ) te++;
1788 if (strcmp(key,
"END") == 0) v->
rlen = z[0] - v->
pos;
1791 z = (
float*)alloca(n_val<<2);
1792 for (i = 0, t = val; i < n_val; ++i, ++t)
1794 z[i] = strtod(t, &te);
1798 while ( *te && *te!=
',' ) te++;
1821 if (ret < 0)
return -1;
1828 fmt->
id = bcf_dec_typed_int1(ptr, &ptr);
1829 fmt->
n = bcf_dec_size(ptr, &ptr, &fmt->
type);
1832 fmt->
p_off = ptr - ptr_start;
1834 ptr += n_sample * fmt->
size;
1835 fmt->
p_len = ptr - fmt->
p;
1842 info->
key = bcf_dec_typed_int1(ptr, &ptr);
1843 info->
len = bcf_dec_size(ptr, &ptr, &info->
type);
1848 if (info->
len == 1) {
1872 tmp.
l = 0; tmp.
s = d->id; tmp.
m = d->m_id;
1877 d->id = tmp.
s; d->m_id = tmp.
m;
1880 tmp.
l = 0; tmp.
s = d->als; tmp.
m = d->m_als;
1881 offset = (
int*)alloca(b->
n_allele *
sizeof(
int));
1883 for (i = 0; i < b->
n_allele; ++i) {
1889 d->als = tmp.
s; d->m_als = tmp.
m;
1893 d->allele[i] = d->als + offset[i];
1896 if ((which&BCF_UN_FLT) && !(b->
unpacked&BCF_UN_FLT)) {
1901 d->n_flt = bcf_dec_size(ptr, &ptr, &type);
1903 for (i = 0; i < d->n_flt; ++i)
1904 d->flt[i] = bcf_dec_int1(ptr, type, &ptr);
1905 }
else ++ptr, d->n_flt = 0;
1909 if ((which&BCF_UN_INFO) && !(b->
unpacked&BCF_UN_INFO)) {
1912 for (i = 0; i < d->m_info; ++i) d->info[i].vptr_free = 0;
1914 ptr = bcf_unpack_info_core1(ptr, &d->info[i]);
1920 for (i = 0; i < d->m_fmt; ++i) d->fmt[i].p_free = 0;
1922 ptr = bcf_unpack_fmt_core1(ptr, b->
n_sample, &d->fmt[i]);
1933 kputc(
'\t', s); kputw(v->
pos + 1, s);
1934 kputc(
'\t', s); kputs(v->
d.
id ? v->
d.
id :
".", s);
1940 for (i = 1; i < v->
n_allele; ++i) {
1941 if (i > 1) kputc(
',', s);
1944 }
else kputc(
'.', s);
1950 for (i = 0; i < v->
d.
n_flt; ++i) {
1951 if (i) kputc(
';', s);
1954 }
else kputc(
'.', s);
1958 for (i = 0; i < v->
n_info; ++i) {
1960 if ( !z->
vptr )
continue;
1961 if ( !first ) kputc(
';', s); first = 0;
1963 if (z->
len <= 0)
continue;
1968 else kputc(z->
v1.
i, s);
1971 if ( first ) kputc(
'.', s);
1972 }
else kputc(
'.', s);
1982 for (i = 0; i < (int)v->
n_fmt; ++i) {
1983 if ( !fmt[i].p )
continue;
1984 kputc(!first ?
':' :
'\t', s); first = 0;
1987 fprintf(stderr,
"[E::%s] invalid BCF, the FORMAT tag id=%d not present in the header.\n", __func__, fmt[i].
id);
1993 if ( first ) kputs(
"\t.", s);
1994 for (j = 0; j < v->
n_sample; ++j) {
1997 for (i = 0; i < (int)v->
n_fmt; ++i) {
1999 if ( !f->
p )
continue;
2000 if (!first) kputc(
':', s); first = 0;
2002 bcf_format_gt(f,j,s);
2006 if ( first ) kputc(
'.', s);
2020 if ( line->
s[line->
l-1]!=
'\n' ) kputc(
'\n',line);
2025 return ret==line->
l ? 0 : -1;
2037 return ret==fp->
line.
l ? 0 : -1;
2047 vdict_t *d = (vdict_t*)h->
dict[which];
2048 k =
kh_get(vdict, d,
id);
2065 if ( !h )
return NULL;
2073 if ( !max_len ) max_len = ((
int64_t)1<<31) - 1;
2075 for (n_lvls = 0, s = 1<<min_shift; max_len > s; ++n_lvls, s <<= 3);
2098 if ((fp =
hts_open(fn,
"rb")) == 0)
return -1;
2102 if ( !idx )
return -1;
2114 int i, ndst_ori = dst->
nhrec, need_sync = 0;
2115 for (i=0; i<src->
nhrec; i++)
2120 for (j=0; j<ndst_ori; j++)
2142 fprintf(stderr,
"[%s:%d %s] Unchecked error (%d), exiting.\n", __FILE__,__LINE__,__FUNCTION__,line->
errcode);
2145 if ( src_hdr->
ntransl==-1 )
return 0;
2149 for (dict=0; dict<2; dict++)
2151 src_hdr->
transl[dict] = (
int*) malloc(src_hdr->
n[dict]*
sizeof(
int));
2152 for (i=0; i<src_hdr->
n[dict]; i++)
2154 if ( i>=dst_hdr->
n[dict] || strcmp(src_hdr->
id[dict][i].
key,dst_hdr->
id[dict][i].
key) )
2160 src_hdr->
transl[dict][i] = -1;
2169 if ( src_hdr->
ntransl==-1 )
return 0;
2177 for (i=0; i<line->
d.
n_flt; i++)
2179 int src_id = line->
d.
flt[i];
2186 for (i=0; i<line->
n_info; i++)
2190 if ( dst_id<0 )
continue;
2193 if ( src_size==dst_size )
2206 bcf_enc_int1(&str, dst_id);
2207 bcf_enc_size(&str, info->
len,info->
type);
2218 for (i=0; i<line->
n_fmt; i++)
2220 int src_id = line->
d.
fmt[i].
id;
2222 if ( dst_id<0 )
continue;
2225 if ( src_size==dst_size )
2227 line->
d.
fmt[i].
id = dst_id;
2231 else {
uint8_t *x = (
uint8_t*) &dst_id; p[1] = x[0]; p[2] = x[1]; p[3] = x[2]; p[4] = x[3]; }
2238 bcf_enc_int1(&str, dst_id);
2239 bcf_enc_size(&str, fmt->
n, fmt->
type);
2241 kputsn((
char*)fmt->
p, fmt->
p_len, &str);
2266 str.
l = str.
m = 0; str.
s = 0;
2270 for (j=0; j<n; j++) imap[j] = -1;
2273 int i = 0, end = n? 8 : 7;
2274 while ((p = strstr(htxt,
"#CHROM\t")) != 0)
2275 if (p > htxt && *(p-1) ==
'\n')
break;
2276 while ((p = strchr(p,
'\t')) != 0 && i < end) ++i, ++p;
2278 free(h); free(str.
s);
2281 kputsn(htxt, p - htxt, &str);
2282 for (i = 0; i < n; ++i) {
2284 if (imap[i] < 0)
continue;
2286 kputs(samples[i], &str);
2288 }
else kputsn(htxt, hlen, &str);
2289 while (str.
l && (!str.
s[str.
l-1] || str.
s[str.
l-1]==
'\n') ) str.
l--;
2299 if ( samples && !strcmp(
"-",samples) )
return 0;
2306 if ( samples[0]==
'^' )
2309 int idx, n, ret = 0;
2310 char **smpls =
hts_readlist(samples[0]==
'^'?samples+1:samples, is_file, &n);
2311 if ( !smpls )
return -1;
2317 if ( !ret ) ret = i+1;
2321 if ( samples[0]==
'^' )
2326 for (i=0; i<n; i++) free(smpls[i]);
2354 kh_val(d, k) = bcf_idinfo_def;
2366 ind.
s = 0; ind.
l = ind.
m = 0;
2373 ptr = bcf_unpack_fmt_core1(ptr, v->
n_sample, &fmt[i]);
2374 for (i = 0; i < (int)v->
n_fmt; ++i) {
2376 bcf_enc_int1(&ind, f->
id);
2377 bcf_enc_size(&ind, f->
n, f->
type);
2378 for (j = 0; j < n; ++j)
2379 if (imap[j] >= 0) kputsn((
char*)(f->
p + imap[j] * f->
size), f->
size, &ind);
2381 for (i = j = 0; j < n; ++j) if (imap[j] >= 0) ++i;
2396 if (strlen(v->
d.
allele[i]) != 1)
break;
2400 static void bcf_set_variant_type(
const char *ref,
const char *alt,
variant_t *var)
2403 if ( !ref[1] && !alt[1] )
2405 if ( *alt ==
'.' || *ref==*alt ) { var->
n = 0; var->
type =
VCF_REF;
return; }
2406 if ( *alt ==
'X' ) { var->
n = 0; var->
type =
VCF_REF;
return; }
2410 const char *r = ref, *a = alt;
2411 while (*r && *a && *r==*a ) { r++; a++; }
2418 else if ( *r && !*a )
2423 else if ( !*r && !*a )
2428 const char *re = r, *ae = a;
2429 while ( re[1] ) re++;
2430 while ( ae[1] ) ae++;
2431 while ( *re==*ae && re>r && ae>a ) { re--; ae--; }
2434 if ( re==r ) { var->
n = 1; var->
type =
VCF_SNP;
return; }
2447 var->
n = ( re-r > ae-a ) ? -(re-r+1) : ae-a+1;
2452 static void bcf_set_variant_types(
bcf1_t *b)
2473 if ( rec->
d.
var_type==-1 ) bcf_set_variant_types(rec);
2478 if ( rec->
d.
var_type==-1 ) bcf_set_variant_types(rec);
2479 return rec->
d.
var[ith_allele].
type;
2489 for (i=0; i<line->
n_info; i++)
2490 if ( inf_id==line->
d.
info[i].
key )
break;
2511 bcf_enc_int1(&str, inf_id);
2525 fprintf(stderr,
"[E::%s] the type %d not implemented yet\n", __func__, type);
2537 memcpy(ptr, str.
s, str.
l);
2540 bcf_unpack_info_core1(ptr, inf);
2546 bcf_unpack_info_core1((
uint8_t*)str.
s, inf);
2557 bcf_unpack_info_core1((
uint8_t*)str.
s, inf);
2573 int len = strlen(values[i]);
2574 if ( len > max_len ) max_len = len;
2576 char *out = (
char*) malloc(max_len*n);
2577 if ( !out )
return -2;
2580 char *dst = out+i*max_len;
2581 const char *src = values[i];
2583 while ( src[j] ) { dst[j] = src[j]; j++; }
2584 for (; j<max_len; j++) dst[j] = 0;
2603 for (i=0; i<line->
n_fmt; i++)
2604 if ( line->
d.
fmt[i].
id==fmt_id )
break;
2614 free(fmt->
p - fmt->
p_off);
2625 assert( nps && nps*line->
n_sample==n );
2629 bcf_enc_int1(&str, fmt_id);
2635 kputsn((
char*)values, nps*line->
n_sample*
sizeof(
float), &str);
2640 kputsn((
char*)values, nps*line->
n_sample, &str);
2644 fprintf(stderr,
"[E::%s] the type %d not implemented yet\n", __func__, type);
2655 if ( line->
n_fmt > 1 && key[0]==
'G' && key[1]==
'T' && !key[2] )
2657 for (i=line->
n_fmt-1; i>0; i--)
2659 fmt = &line->
d.
fmt[0];
2675 memcpy(ptr, str.
s, str.
l);
2677 int p_free = fmt->
p_free;
2678 bcf_unpack_fmt_core1(ptr, line->
n_sample, fmt);
2703 line->
d.
flt[i] = flt_ids[i];
2711 for (i=0; i<line->
d.
n_flt; i++)
2712 if ( flt_id==line->
d.
flt[i] )
break;
2713 if ( i<line->d.
n_flt )
return 0;
2717 else if ( line->
d.
n_flt==1 && line->
d.
flt[0]==0 )
2729 for (i=0; i<line->
d.
n_flt; i++)
2730 if ( flt_id==line->
d.
flt[i] )
break;
2731 if ( i==line->
d.
n_flt )
return 0;
2741 if ( filter[0]==
'.' && !filter[1] ) filter =
"PASS";
2746 if (
id==0 && !line->
d.
n_flt)
return 1;
2749 for (i=0; i<line->
d.
n_flt; i++)
2750 if ( line->
d.
flt[i]==
id )
return 1;
2754 static inline int _bcf1_sync_alleles(
const bcf_hdr_t *hdr,
bcf1_t *line,
int nals)
2761 char *als = line->
d.
als;
2766 while ( *als ) als++;
2775 char *free_old = NULL;
2779 for (i=0; i<nals; i++)
2780 if ( alleles[i]>=line->
d.
als && alleles[i]<line->
d.
als+line->
d.
m_als )
break;
2787 free_old = line->
d.
als;
2789 for (i=0; i<nals; i++)
2791 kputs(alleles[i], &tmp);
2796 return _bcf1_sync_alleles(hdr,line,nals);
2803 kputs(alleles_string, &tmp);
2807 char *t = line->
d.
als;
2810 if ( *t==
',' ) { *t = 0; nals++; }
2813 return _bcf1_sync_alleles(hdr, line, nals);
2819 tmp.
l = 0; tmp.
s = line->
d.
id; tmp.
m = line->
d.
m_id;
2834 for (i=0; i<line->
n_fmt; i++)
2836 if ( line->
d.
fmt[i].
id==
id )
return &line->
d.
fmt[i];
2846 for (i=0; i<line->
n_info; i++)
2861 for (i=0; i<line->
n_info; i++)
2862 if ( line->
d.
info[i].
key==tag_id )
break;
2869 if ( *ndst < info->len+1 )
2871 *ndst = info->
len + 1;
2872 *dst = realloc(*dst, *ndst);
2874 memcpy(*dst,info->
vptr,info->
len);
2881 if ( *ndst < info->len )
2884 *dst = realloc(*dst, *ndst * size1);
2887 if ( info->
len == 1 )
2894 #define BRANCH(type_t, is_missing, is_vector_end, set_missing, out_type_t) { \
2895 out_type_t *tmp = (out_type_t *) *dst; \
2896 type_t *p = (type_t *) info->vptr; \
2897 for (j=0; j<info->len; j++) \
2899 if ( is_vector_end ) return j; \
2900 if ( is_missing ) set_missing; \
2906 switch (info->
type) {
2911 default: fprintf(stderr,
"TODO: %s:%d .. info->type=%d\n", __FILE__,__LINE__, info->
type); exit(1);
2925 for (i=0; i<line->
n_fmt; i++)
2926 if ( line->
d.
fmt[i].
id==tag_id )
break;
2927 if ( i==line->
n_fmt )
return -3;
2933 *dst = (
char**) malloc(
sizeof(
char*)*nsmpl);
2934 if ( !*dst )
return -4;
2937 int n = (fmt->
n+1)*nsmpl;
2940 (*dst)[0] = realloc((*dst)[0], n);
2941 if ( !(*dst)[0] )
return -4;
2944 for (i=0; i<nsmpl; i++)
2948 memcpy(tmp,src,fmt->
n);
2950 (*dst)[i] = (
char*) tmp;
2959 if ( tag[0]==
'G' && tag[1]==
'T' && tag[2]==0 )
2968 for (i=0; i<line->
n_fmt; i++)
2969 if ( line->
d.
fmt[i].
id==tag_id )
break;
2970 if ( i==line->
n_fmt )
return -3;
2978 *dst = realloc(*dst, n);
2979 if ( !*dst )
return -4;
2982 memcpy(*dst,fmt->
p,n);
2989 if ( *ndst < fmt->n*nsmpl )
2991 *ndst = fmt->
n*nsmpl;
2992 *dst = realloc(*dst, *ndst*size1);
2993 if ( !dst )
return -4;
2996 #define BRANCH(type_t, is_missing, is_vector_end, set_missing, set_vector_end, out_type_t) { \
2997 out_type_t *tmp = (out_type_t *) *dst; \
2998 type_t *p = (type_t*) fmt->p; \
2999 for (i=0; i<nsmpl; i++) \
3001 for (j=0; j<fmt->n; j++) \
3003 if ( is_missing ) set_missing; \
3004 else if ( is_vector_end ) { set_vector_end; break; } \
3008 for (; j<fmt->n; j++) { set_vector_end; tmp++; } \
3009 p = (type_t *)((char *)p + fmt->size); \
3012 switch (fmt->
type) {
3017 default: fprintf(stderr,
"TODO: %s:%d .. fmt->type=%d\n", __FILE__,__LINE__, fmt->
type); exit(1);
3020 return nsmpl*fmt->
n;