NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
vcf.c
Go to the documentation of this file.
1 #include <zlib.h>
2 #include <stdio.h>
3 #include <ctype.h>
4 #include <assert.h>
5 #include <string.h>
6 #include <stdlib.h>
7 #include <limits.h>
8 #include "htslib/kstring.h"
9 #include "htslib/bgzf.h"
10 #include "htslib/vcf.h"
11 #include "htslib/tbx.h"
12 #include "htslib/hfile.h"
13 
14 #include "htslib/khash.h"
16  typedef khash_t(vdict) vdict_t;
17 
18 #include "htslib/kseq.h"
20 
// Raw 32-bit patterns kept as integers. Judging by the names they presumably
// mark a missing float value and the end of a float vector -- TODO confirm
// against htslib/vcf.h.
21  uint32_t bcf_float_missing = 0x7F800001;
22  uint32_t bcf_float_vector_end = 0x7F800002;
// 16-entry lookup table. NOTE(review): presumably indexed by the BCF type code
// and holding log2 of the element size in bytes -- confirm against its users.
23  uint8_t bcf_type_shift[] = { 0, 0, 1, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
// Template copied into every freshly inserted dictionary entry before the real
// id / info / hrec fields are filled in: no header records attached, id of -1.
24 static bcf_idinfo_t bcf_idinfo_def = { .info = { 15, 15, 15 }, .hrec = { NULL, NULL, NULL}, .id = -1 };
25 
26 /*************************
27  *** VCF header parser ***
28  *************************/
29 
30 int bcf_hdr_sync(bcf_hdr_t *h);
31 
32 int bcf_hdr_add_sample(bcf_hdr_t *h, const char *s)
33 {
34  if ( !s )
35  {
36  bcf_hdr_sync(h);
37  return 0;
38  }
39 
40  const char *ss = s;
41  while ( !*ss && isspace(*ss) ) ss++;
42  if ( !*ss )
43  {
44  fprintf(stderr,"[W::%s] Empty sample name: trailing spaces/tabs in the header line?\n", __func__);
45  abort();
46  }
47 
48  vdict_t *d = (vdict_t*)h->dict[BCF_DT_SAMPLE];
49  int ret;
50  char *sdup = strdup(s);
51  int k = kh_put(vdict, d, sdup, &ret);
52  if (ret) { // absent
53  kh_val(d, k) = bcf_idinfo_def;
54  kh_val(d, k).id = kh_size(d) - 1;
55  } else {
56  if (hts_verbose >= 2)
57  fprintf(stderr, "[W::%s] Duplicated sample name '%s'. Skipped.\n", __func__, s);
58  free(sdup);
59  return -1;
60  }
61  int n = kh_size(d);
62  h->samples = (char**) realloc(h->samples,sizeof(char*)*n);
63  h->samples[n-1] = sdup;
64  return 0;
65 }
66 
67 void bcf_hdr_parse_sample_line(bcf_hdr_t *h, const char *str)
68 {
69  int i = 0;
70  const char *p, *q;
71  // add samples
72  for (p = q = str;; ++q) {
73  if (*q != '\t' && *q != 0 && *q != '\n') continue;
74  if (++i > 9) {
75  char *s = (char*)malloc(q - p + 1);
76  strncpy(s, p, q - p);
77  s[q - p] = 0;
78  bcf_hdr_add_sample(h,s);
79  free(s);
80  }
81  if (*q == 0 || *q == '\n') break;
82  p = q + 1;
83  }
84  bcf_hdr_add_sample(h,NULL);
85 }
86 
88 {
89  int i;
90  for (i = 0; i < 3; i++)
91  {
92  vdict_t *d = (vdict_t*)h->dict[i];
93  khint_t k;
94 
95  // find out the largest id, there may be holes because of IDX
96  int max_id = -1;
97  for (k=kh_begin(d); k<kh_end(d); k++)
98  {
99  if (!kh_exist(d,k)) continue;
100  if ( max_id < kh_val(d,k).id ) max_id = kh_val(d,k).id;
101  }
102  if ( max_id >= h->n[i] )
103  {
104  h->id[i] = (bcf_idpair_t*)realloc(h->id[i], (max_id+1)*sizeof(bcf_idpair_t));
105  for (k=h->n[i]; k<=max_id; k++)
106  {
107  h->id[i][k].key = NULL;
108  h->id[i][k].val = NULL;
109  }
110  h->n[i] = max_id+1;
111  }
112  for (k=kh_begin(d); k<kh_end(d); k++)
113  {
114  if (!kh_exist(d,k)) continue;
115  h->id[i][kh_val(d,k).id].key = kh_key(d,k);
116  h->id[i][kh_val(d,k).id].val = &kh_val(d,k);
117  }
118  }
119  return 0;
120 }
121 
123 {
124  free(hrec->key);
125  if ( hrec->value ) free(hrec->value);
126  int i;
127  for (i=0; i<hrec->nkeys; i++)
128  {
129  free(hrec->keys[i]);
130  free(hrec->vals[i]);
131  }
132  free(hrec->keys);
133  free(hrec->vals);
134  free(hrec);
135 }
136 
137 // Copies all fields except IDX.
139 {
140  bcf_hrec_t *out = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t));
141  out->type = hrec->type;
142  if ( hrec->key ) out->key = strdup(hrec->key);
143  if ( hrec->value ) out->value = strdup(hrec->value);
144  out->nkeys = hrec->nkeys;
145  out->keys = (char**) malloc(sizeof(char*)*hrec->nkeys);
146  out->vals = (char**) malloc(sizeof(char*)*hrec->nkeys);
147  int i, j = 0;
148  for (i=0; i<hrec->nkeys; i++)
149  {
150  if ( hrec->keys[i] && !strcmp("IDX",hrec->keys[i]) ) continue;
151  if ( hrec->keys[i] ) out->keys[j] = strdup(hrec->keys[i]);
152  if ( hrec->vals[i] ) out->vals[j] = strdup(hrec->vals[i]);
153  j++;
154  }
155  if ( i!=j ) out->nkeys--; // IDX was omitted
156  return out;
157 }
158 
159 void bcf_hrec_debug(FILE *fp, bcf_hrec_t *hrec)
160 {
161  fprintf(fp, "key=[%s] value=[%s]", hrec->key, hrec->value?hrec->value:"");
162  int i;
163  for (i=0; i<hrec->nkeys; i++)
164  fprintf(fp, "\t[%s]=[%s]", hrec->keys[i],hrec->vals[i]);
165  fprintf(fp, "\n");
166 }
167 
169 {
170  int i, j;
171  for (i=0; i<hdr->nhrec; i++)
172  {
173  if ( !hdr->hrec[i]->value )
174  {
175  fprintf(stderr, "##%s=<", hdr->hrec[i]->key);
176  fprintf(stderr,"%s=%s", hdr->hrec[i]->keys[0], hdr->hrec[i]->vals[0]);
177  for (j=1; j<hdr->hrec[i]->nkeys; j++)
178  fprintf(stderr,",%s=%s", hdr->hrec[i]->keys[j], hdr->hrec[i]->vals[j]);
179  fprintf(stderr,">\n");
180  }
181  else
182  fprintf(stderr,"##%s=%s\n", hdr->hrec[i]->key,hdr->hrec[i]->value);
183  }
184 }
185 
186 void bcf_hrec_add_key(bcf_hrec_t *hrec, const char *str, int len)
187 {
188  int n = ++hrec->nkeys;
189  hrec->keys = (char**) realloc(hrec->keys, sizeof(char*)*n);
190  hrec->vals = (char**) realloc(hrec->vals, sizeof(char*)*n);
191  assert( len );
192  hrec->keys[n-1] = (char*) malloc((len+1)*sizeof(char));
193  memcpy(hrec->keys[n-1],str,len);
194  hrec->keys[n-1][len] = 0;
195  hrec->vals[n-1] = NULL;
196 }
197 
198 void bcf_hrec_set_val(bcf_hrec_t *hrec, int i, const char *str, int len, int is_quoted)
199 {
200  if ( !str ) { hrec->vals[i] = NULL; return; }
201  if ( hrec->vals[i] ) free(hrec->vals[i]);
202  if ( is_quoted )
203  {
204  hrec->vals[i] = (char*) malloc((len+3)*sizeof(char));
205  hrec->vals[i][0] = '"';
206  memcpy(&hrec->vals[i][1],str,len);
207  hrec->vals[i][len+1] = '"';
208  hrec->vals[i][len+2] = 0;
209  }
210  else
211  {
212  hrec->vals[i] = (char*) malloc((len+1)*sizeof(char));
213  memcpy(hrec->vals[i],str,len);
214  hrec->vals[i][len] = 0;
215  }
216 }
217 
218 void hrec_add_idx(bcf_hrec_t *hrec, int idx)
219 {
220  int n = ++hrec->nkeys;
221  hrec->keys = (char**) realloc(hrec->keys, sizeof(char*)*n);
222  hrec->vals = (char**) realloc(hrec->vals, sizeof(char*)*n);
223  hrec->keys[n-1] = strdup("IDX");
224  kstring_t str = {0,0,0};
225  kputw(idx, &str);
226  hrec->vals[n-1] = str.s;
227 }
228 
229 int bcf_hrec_find_key(bcf_hrec_t *hrec, const char *key)
230 {
231  int i;
232  for (i=0; i<hrec->nkeys; i++)
233  if ( !strcasecmp(key,hrec->keys[i]) ) return i;
234  return -1;
235 }
236 
/**
 * Tell whether the character at str is escaped by a preceding backslash.
 *
 * @param min  first valid character of the buffer; scanning never goes below it
 * @param str  position of the character being examined (min <= str)
 * @return 1 when an odd number of consecutive backslashes directly precede
 *         str, 0 otherwise
 *
 * The scan inspects str[-1] only while str > min, so no pointer before the
 * start of the buffer is ever formed.
 */
static inline int is_escaped(const char *min, const char *str)
{
    int n = 0;
    while ( str > min && str[-1]=='\\' )
    {
        str--;
        n++;
    }
    return n%2;
}
243 
244 bcf_hrec_t *bcf_hdr_parse_line(const bcf_hdr_t *h, const char *line, int *len)
245 {
246  const char *p = line;
247  if (p[0] != '#' || p[1] != '#') { *len = 0; return NULL; }
248  p += 2;
249 
250  const char *q = p;
251  while ( *q && *q!='=' ) q++;
252  int n = q-p;
253  if ( *q!='=' || !n ) { *len = q-line+1; return NULL; } // wrong format
254 
255  bcf_hrec_t *hrec = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t));
256  hrec->key = (char*) malloc(sizeof(char)*(n+1));
257  memcpy(hrec->key,p,n);
258  hrec->key[n] = 0;
259 
260  p = ++q;
261  if ( *p!='<' ) // generic field, e.g. ##samtoolsVersion=0.1.18-r579
262  {
263  while ( *q && *q!='\n' ) q++;
264  hrec->value = (char*) malloc((q-p+1)*sizeof(char));
265  memcpy(hrec->value, p, q-p);
266  hrec->value[q-p] = 0;
267  *len = q-line+1;
268  return hrec;
269  }
270 
271  // structured line, e.g. ##INFO=<ID=PV1,Number=1,Type=Float,Description="P-value for baseQ bias">
272  int nopen = 1;
273  while ( *q && *q!='\n' && nopen )
274  {
275  p = ++q;
276  while ( *q && *q!='=' ) q++;
277  n = q-p;
278  if ( *q!='=' || !n ) { *len = q-line+1; bcf_hrec_destroy(hrec); return NULL; } // wrong format
279  bcf_hrec_add_key(hrec, p, q-p);
280  p = ++q;
281  int quoted = *p=='"' ? 1 : 0;
282  if ( quoted ) p++, q++;
283  while (1)
284  {
285  if ( !*q ) break;
286  if ( quoted ) { if ( *q=='"' && !is_escaped(p,q) ) break; }
287  else
288  {
289  if ( *q=='<' ) nopen++;
290  if ( *q=='>' ) nopen--;
291  if ( !nopen ) break;
292  if ( *q==',' && nopen==1 ) break;
293  }
294  q++;
295  }
296  bcf_hrec_set_val(hrec, hrec->nkeys-1, p, q-p, quoted);
297  if ( quoted ) q++;
298  if ( *q=='>' ) { nopen--; q++; }
299  }
300  *len = q-line+1;
301  return hrec;
302 }
303 
304 // returns: 1 when hdr needs to be synced, 0 otherwise
306 {
307  // contig
308  int i,j,k, ret;
309  char *str;
310  if ( !strcmp(hrec->key, "contig") )
311  {
312  hrec->type = BCF_HL_CTG;
313 
314  // Get the contig ID ($str) and length ($j)
315  i = bcf_hrec_find_key(hrec,"length");
316  if ( i<0 ) return 0;
317  if ( sscanf(hrec->vals[i],"%d",&j)!=1 ) return 0;
318 
319  i = bcf_hrec_find_key(hrec,"ID");
320  if ( i<0 ) return 0;
321  str = strdup(hrec->vals[i]);
322 
323  // Register in the dictionary
324  vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_CTG];
325  k = kh_put(vdict, d, str, &ret);
326  if ( !ret ) { free(str); return 0; } // already present
327 
328  int idx = bcf_hrec_find_key(hrec,"IDX");
329  if ( idx!=-1 )
330  {
331  char *tmp = hrec->vals[idx];
332  idx = strtol(hrec->vals[idx], &tmp, 10);
333  if ( *tmp )
334  {
335  fprintf(stderr,"[%s:%d %s] Error parsing the IDX tag, skipping.\n", __FILE__,__LINE__,__FUNCTION__);
336  return 0;
337  }
338  }
339  else
340  {
341  idx = kh_size(d) - 1;
342  hrec_add_idx(hrec, idx);
343  }
344 
345  kh_val(d, k) = bcf_idinfo_def;
346  kh_val(d, k).id = idx;
347  kh_val(d, k).info[0] = i;
348  kh_val(d, k).hrec[0] = hrec;
349 
350  return 1;
351  }
352 
353  if ( !strcmp(hrec->key, "INFO") ) hrec->type = BCF_HL_INFO;
354  else if ( !strcmp(hrec->key, "FILTER") ) hrec->type = BCF_HL_FLT;
355  else if ( !strcmp(hrec->key, "FORMAT") ) hrec->type = BCF_HL_FMT;
356  else if ( hrec->nkeys>0 ) { hrec->type = BCF_HL_STR; return 1; }
357  else return 0;
358 
359  // INFO/FILTER/FORMAT
360  char *id = NULL;
361  int type = -1, num = -1, var = -1, idx = -1;
362  for (i=0; i<hrec->nkeys; i++)
363  {
364  if ( !strcmp(hrec->keys[i], "ID") ) id = hrec->vals[i];
365  else if ( !strcmp(hrec->keys[i], "IDX") )
366  {
367  char *tmp = hrec->vals[i];
368  idx = strtol(hrec->vals[i], &tmp, 10);
369  if ( *tmp )
370  {
371  fprintf(stderr,"[%s:%d %s] Error parsing the IDX tag, skipping.\n", __FILE__,__LINE__,__FUNCTION__);
372  return 0;
373  }
374  }
375  else if ( !strcmp(hrec->keys[i], "Type") )
376  {
377  if ( !strcmp(hrec->vals[i], "Integer") ) type = BCF_HT_INT;
378  else if ( !strcmp(hrec->vals[i], "Float") ) type = BCF_HT_REAL;
379  else if ( !strcmp(hrec->vals[i], "String") ) type = BCF_HT_STR;
380  else if ( !strcmp(hrec->vals[i], "Flag") ) type = BCF_HT_FLAG;
381  else
382  {
383  fprintf(stderr, "[E::%s] The type \"%s\" not supported, assuming \"String\"\n", __func__, hrec->vals[i]);
384  type = BCF_HT_STR;
385  }
386  }
387  else if ( !strcmp(hrec->keys[i], "Number") )
388  {
389  if ( !strcmp(hrec->vals[i],"A") ) var = BCF_VL_A;
390  else if ( !strcmp(hrec->vals[i],"R") ) var = BCF_VL_R;
391  else if ( !strcmp(hrec->vals[i],"G") ) var = BCF_VL_G;
392  else if ( !strcmp(hrec->vals[i],".") ) var = BCF_VL_VAR;
393  else
394  {
395  sscanf(hrec->vals[i],"%d",&num);
396  var = BCF_VL_FIXED;
397  }
398  if (var != BCF_VL_FIXED) num = 0xfffff;
399 
400  }
401  }
402  uint32_t info = (uint32_t)num<<12 | var<<8 | type<<4 | hrec->type;
403 
404  if ( !id ) return 0;
405  str = strdup(id);
406 
407  vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_ID];
408  k = kh_put(vdict, d, str, &ret);
409  if ( !ret )
410  {
411  // already present
412  free(str);
413  if ( kh_val(d, k).hrec[info&0xf] ) return 0;
414  kh_val(d, k).info[info&0xf] = info;
415  kh_val(d, k).hrec[info&0xf] = hrec;
416  return 1;
417  }
418  kh_val(d, k) = bcf_idinfo_def;
419  kh_val(d, k).info[info&0xf] = info;
420  kh_val(d, k).hrec[info&0xf] = hrec;
421  kh_val(d, k).id = idx==-1 ? kh_size(d) - 1 : idx;
422 
423  if ( idx==-1 ) hrec_add_idx(hrec, kh_val(d, k).id);
424 
425  return 1;
426 }
427 
429 {
430  hrec->type = BCF_HL_GEN;
431  if ( !bcf_hdr_register_hrec(hdr,hrec) )
432  {
433  // If one of the hashed field, then it is already present
434  if ( hrec->type != BCF_HL_GEN )
435  {
436  bcf_hrec_destroy(hrec);
437  return 0;
438  }
439 
440  // Is one of the generic fields and already present?
441  int i;
442  for (i=0; i<hdr->nhrec; i++)
443  {
444  if ( hdr->hrec[i]->type!=BCF_HL_GEN ) continue;
445  if ( !strcmp(hdr->hrec[i]->key,hrec->key) && !strcmp(hrec->key,"fileformat") ) break;
446  if ( !strcmp(hdr->hrec[i]->key,hrec->key) && !strcmp(hdr->hrec[i]->value,hrec->value) ) break;
447  }
448  if ( i<hdr->nhrec )
449  {
450  bcf_hrec_destroy(hrec);
451  return 0;
452  }
453  }
454 
455  // New record, needs to be added
456  int n = ++hdr->nhrec;
457  hdr->hrec = (bcf_hrec_t**) realloc(hdr->hrec, n*sizeof(bcf_hrec_t*));
458  hdr->hrec[n-1] = hrec;
459 
460  return hrec->type==BCF_HL_GEN ? 0 : 1;
461 }
462 
463 bcf_hrec_t *bcf_hdr_get_hrec(const bcf_hdr_t *hdr, int type, const char *id)
464 {
465  int i;
466  if ( type==BCF_HL_GEN )
467  {
468  for (i=0; i<hdr->nhrec; i++)
469  {
470  if ( hdr->hrec[i]->type!=BCF_HL_GEN ) continue;
471  if ( !strcmp(hdr->hrec[i]->key,id) ) return hdr->hrec[i];
472  }
473  return NULL;
474  }
475  vdict_t *d = type==BCF_HL_CTG ? (vdict_t*)hdr->dict[BCF_DT_CTG] : (vdict_t*)hdr->dict[BCF_DT_ID];
476  khint_t k = kh_get(vdict, d, id);
477  if ( k == kh_end(d) ) return NULL;
478  return kh_val(d, k).hrec[type==BCF_HL_CTG?0:type];
479 }
480 
482 {
483  static int PL_warned = 0, GL_warned = 0;
484 
485  if ( !PL_warned )
486  {
487  int id = bcf_hdr_id2int(hdr, BCF_DT_ID, "PL");
489  {
490  fprintf(stderr,"[W::%s] PL should be declared as Number=G\n", __func__);
491  PL_warned = 1;
492  }
493  }
494  if ( !GL_warned )
495  {
496  int id = bcf_hdr_id2int(hdr, BCF_HL_FMT, "GL");
498  {
499  fprintf(stderr,"[W::%s] GL should be declared as Number=G\n", __func__);
500  PL_warned = 1;
501  }
502  }
503 }
504 
505 int bcf_hdr_parse(bcf_hdr_t *hdr, char *htxt)
506 {
507  int len, needs_sync = 0;
508  char *p = htxt;
509 
510  // Check sanity: "fileformat" string must come as first
511  bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr,p,&len);
512  if ( !hrec->key || strcasecmp(hrec->key,"fileformat") )
513  fprintf(stderr, "[W::%s] The first line should be ##fileformat; is the VCF/BCF header broken?\n", __func__);
514  needs_sync += bcf_hdr_add_hrec(hdr, hrec);
515 
516  // The filter PASS must appear first in the dictionary
517  hrec = bcf_hdr_parse_line(hdr,"##FILTER=<ID=PASS,Description=\"All filters passed\">",&len);
518  needs_sync += bcf_hdr_add_hrec(hdr, hrec);
519 
520  // Parse the whole header
521  while ( (hrec=bcf_hdr_parse_line(hdr,p,&len)) )
522  {
523  needs_sync += bcf_hdr_add_hrec(hdr, hrec);
524  p += len;
525  }
527  if ( needs_sync ) bcf_hdr_sync(hdr);
529  return 0;
530 }
531 
532 int bcf_hdr_append(bcf_hdr_t *hdr, const char *line)
533 {
534  int len;
535  bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr, (char*) line, &len);
536  if ( !hrec ) return -1;
537  if ( bcf_hdr_add_hrec(hdr, hrec) )
538  bcf_hdr_sync(hdr);
539  return 0;
540 }
541 
542 void bcf_hdr_remove(bcf_hdr_t *hdr, int type, const char *key)
543 {
544  int i;
545  bcf_hrec_t *hrec;
546  while (1)
547  {
548  if ( type==BCF_HL_FLT || type==BCF_HL_INFO || type==BCF_HL_FMT || type== BCF_HL_CTG )
549  {
550  hrec = bcf_hdr_get_hrec(hdr, type, key);
551  if ( !hrec ) return;
552 
553  for (i=0; i<hdr->nhrec; i++)
554  if ( hdr->hrec[i]==hrec ) break;
555  assert( i<hdr->nhrec );
556 
557  vdict_t *d = type==BCF_HL_CTG ? (vdict_t*)hdr->dict[BCF_DT_CTG] : (vdict_t*)hdr->dict[BCF_DT_ID];
558  khint_t k = kh_get(vdict, d, key);
559  kh_val(d, k).hrec[type==BCF_HL_CTG?0:type] = NULL;
560  }
561  else
562  {
563  for (i=0; i<hdr->nhrec; i++)
564  {
565  if ( hdr->hrec[i]->type!=type ) continue;
566  if ( !strcmp(hdr->hrec[i]->key,key) ) break;
567  }
568  if ( i==hdr->nhrec ) return;
569  hrec = hdr->hrec[i];
570  }
571 
572  hdr->nhrec--;
573  if ( i < hdr->nhrec )
574  memmove(&hdr->hrec[i],&hdr->hrec[i+1],(hdr->nhrec-i)*sizeof(bcf_hrec_t*));
575  bcf_hrec_destroy(hrec);
576 
577  bcf_hdr_sync(hdr);
578  }
579 }
580 
581 int bcf_hdr_printf(bcf_hdr_t *hdr, const char *fmt, ...)
582 {
583  va_list ap;
584  va_start(ap, fmt);
585  int n = vsnprintf(NULL, 0, fmt, ap) + 2;
586  va_end(ap);
587 
588  char *line = (char*)malloc(n);
589  va_start(ap, fmt);
590  vsnprintf(line, n, fmt, ap);
591  va_end(ap);
592 
593  int ret = bcf_hdr_append(hdr, line);
594 
595  free(line);
596  return ret;
597 }
598 
599 
600 /**********************
601  *** BCF header I/O ***
602  **********************/
603 
604 const char *bcf_hdr_get_version(const bcf_hdr_t *hdr)
605 {
606  bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_GEN, "fileformat");
607  if ( !hrec )
608  {
609  fprintf(stderr,"No version string found, assuming VCFv4.2\n");
610  return "VCFv4.2";
611  }
612  return hrec->value;
613 }
614 
615 void bcf_hdr_set_version(bcf_hdr_t *hdr, const char *version)
616 {
617  bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_GEN, "fileformat");
618  if ( !hrec )
619  {
620  int len;
621  kstring_t str = {0,0,0};
622  ksprintf(&str,"##fileformat=%s", version);
623  hrec = bcf_hdr_parse_line(hdr, str.s, &len);
624  free(str.s);
625  }
626  else
627  {
628  free(hrec->value);
629  hrec->value = strdup(version);
630  }
631  bcf_hdr_sync(hdr);
632 }
633 
634 bcf_hdr_t *bcf_hdr_init(const char *mode)
635 {
636  int i;
637  bcf_hdr_t *h;
638  h = (bcf_hdr_t*)calloc(1, sizeof(bcf_hdr_t));
639  for (i = 0; i < 3; ++i)
640  h->dict[i] = kh_init(vdict);
641  if ( strchr(mode,'w') )
642  {
643  bcf_hdr_append(h, "##fileformat=VCFv4.2");
644  // The filter PASS must appear first in the dictionary
645  bcf_hdr_append(h, "##FILTER=<ID=PASS,Description=\"All filters passed\">");
646  }
647  return h;
648 }
649 
651 {
652  int i;
653  khint_t k;
654  for (i = 0; i < 3; ++i) {
655  vdict_t *d = (vdict_t*)h->dict[i];
656  if (d == 0) continue;
657  for (k = kh_begin(d); k != kh_end(d); ++k)
658  if (kh_exist(d, k)) free((char*)kh_key(d, k));
659  kh_destroy(vdict, d);
660  free(h->id[i]);
661  }
662  for (i=0; i<h->nhrec; i++)
663  bcf_hrec_destroy(h->hrec[i]);
664  if (h->nhrec) free(h->hrec);
665  if (h->samples) free(h->samples);
666  free(h->keep_samples);
667  free(h->transl[0]); free(h->transl[1]);
668  free(h->mem.s);
669  free(h);
670 }
671 
673 {
674  if (!hfp->is_bin)
675  return vcf_hdr_read(hfp);
676 
677  BGZF *fp = hfp->fp.bgzf;
678  uint8_t magic[5];
679  bcf_hdr_t *h;
680  h = bcf_hdr_init("r");
681  if ( bgzf_read(fp, magic, 5)<0 )
682  {
683  fprintf(stderr,"[%s:%d %s] Failed to read the header (reading BCF in text mode?)\n", __FILE__,__LINE__,__FUNCTION__);
684  return NULL;
685  }
686  if (strncmp((char*)magic, "BCF\2\2", 5) != 0)
687  {
688  if (!strncmp((char*)magic, "BCF", 3))
689  fprintf(stderr,"[%s:%d %s] invalid BCF2 magic string: only BCFv2.2 is supported.\n", __FILE__,__LINE__,__FUNCTION__);
690  else if (hts_verbose >= 2)
691  fprintf(stderr, "[E::%s] invalid BCF2 magic string\n", __func__);
692  bcf_hdr_destroy(h);
693  return 0;
694  }
695  int hlen;
696  char *htxt;
697  bgzf_read(fp, &hlen, 4);
698  htxt = (char*)malloc(hlen);
699  bgzf_read(fp, htxt, hlen);
700  bcf_hdr_parse(h, htxt);
701  free(htxt);
702  return h;
703 }
704 
705 int bcf_hdr_write(htsFile *hfp, const bcf_hdr_t *h)
706 {
707  if (!hfp->is_bin) return vcf_hdr_write(hfp, h);
708 
709  int hlen;
710  char *htxt = bcf_hdr_fmt_text(h, 1, &hlen);
711  hlen++; // include the \0 byte
712 
713  BGZF *fp = hfp->fp.bgzf;
714  if ( bgzf_write(fp, "BCF\2\2", 5) !=5 ) return -1;
715  if ( bgzf_write(fp, &hlen, 4) !=4 ) return -1;
716  if ( bgzf_write(fp, htxt, hlen) != hlen ) return -1;
717 
718  free(htxt);
719  return 0;
720 }
721 
722 /********************
723  *** BCF site I/O ***
724  ********************/
725 
727 {
728  bcf1_t *v;
729  v = (bcf1_t*)calloc(1, sizeof(bcf1_t));
730  return v;
731 }
732 
734 {
735  int i;
736  for (i=0; i<v->d.m_info; i++)
737  {
738  if ( v->d.info[i].vptr_free )
739  {
740  free(v->d.info[i].vptr - v->d.info[i].vptr_off);
741  v->d.info[i].vptr_free = 0;
742  }
743  }
744  for (i=0; i<v->d.m_fmt; i++)
745  {
746  if ( v->d.fmt[i].p_free )
747  {
748  free(v->d.fmt[i].p - v->d.fmt[i].p_off);
749  v->d.fmt[i].p_free = 0;
750  }
751  }
752  v->rid = v->pos = v->rlen = v->unpacked = 0;
754  v->n_info = v->n_allele = v->n_fmt = v->n_sample = 0;
755  v->shared.l = v->indiv.l = 0;
756  v->d.var_type = -1;
757  v->d.shared_dirty = 0;
758  v->d.indiv_dirty = 0;
759  v->d.n_flt = 0;
760  v->errcode = 0;
761  if (v->d.m_als) v->d.als[0] = 0;
762  if (v->d.m_id) v->d.id[0] = 0;
763 }
764 
766 {
767  bcf_clear1(v);
768  free(v->d.id);
769  free(v->d.als);
770  free(v->d.allele); free(v->d.flt); free(v->d.info); free(v->d.fmt);
771  if (v->d.var ) free(v->d.var);
772  free(v->shared.s); free(v->indiv.s);
773 }
774 
776 {
777  bcf_empty1(v);
778  free(v);
779 }
780 
781 static inline int bcf_read1_core(BGZF *fp, bcf1_t *v)
782 {
783  uint32_t x[8];
784  int ret;
785  if ((ret = bgzf_read(fp, x, 32)) != 32) {
786  if (ret == 0) return -1;
787  return -2;
788  }
789  bcf_clear1(v);
790  x[0] -= 24; // to exclude six 32-bit integers
791  ks_resize(&v->shared, x[0]);
792  ks_resize(&v->indiv, x[1]);
793  memcpy(v, x + 2, 16);
794  v->n_allele = x[6]>>16; v->n_info = x[6]&0xffff;
795  v->n_fmt = x[7]>>24; v->n_sample = x[7]&0xffffff;
796  v->shared.l = x[0], v->indiv.l = x[1];
797 
798  // silent fix of broken BCFs produced by earlier versions of bcf_subset, prior to and including bd6ed8b4
799  if ( (!v->indiv.l || !v->n_sample) && v->n_fmt ) v->n_fmt = 0;
800 
801  bgzf_read(fp, v->shared.s, v->shared.l);
802  bgzf_read(fp, v->indiv.s, v->indiv.l);
803  return 0;
804 }
805 
// Bit-array helpers over a byte array: bit i lives in byte i/8 at position i%8.
#define bit_array_byte(a,i) ((a)[(i)/8])        // byte holding bit i
#define bit_array_mask(i) (1 << ((i)%8))        // mask selecting bit i within its byte
// Number of bytes needed to address bits 0..n
#define bit_array_size(n) ((n)/8+1)
#define bit_array_set(a,i) (bit_array_byte(a,i) |= bit_array_mask(i))
#define bit_array_clear(a,i) (bit_array_byte(a,i) &= ~bit_array_mask(i))
// Non-zero (the masked bit itself, not 1) when bit i is set
#define bit_array_test(a,i) (bit_array_byte(a,i) & bit_array_mask(i))
810 
811 static inline uint8_t *bcf_unpack_fmt_core1(uint8_t *ptr, int n_sample, bcf_fmt_t *fmt);
// Drop the per-sample FORMAT data of samples that are not selected in
// hdr->keep_samples, compacting rec->indiv in place.
//   hdr: header; keep_samples is tested as a bit array with one bit per
//        original sample (hdr->nsamples_ori of them)
//   rec: record whose indiv block is rewritten
// Returns 0 on every path. Does nothing when hdr->keep_samples is not set.
812 int bcf_subset_format(const bcf_hdr_t *hdr, bcf1_t *rec)
813 {
814  if ( !hdr->keep_samples ) return 0;
815  if ( !bcf_hdr_nsamples(hdr) )
816  {
// no samples kept at all: discard the whole individual block
817  rec->indiv.l = rec->n_sample = 0;
818  return 0;
819  }
820 
821  int i, j;
// ptr walks the packed FORMAT data; dst is the write cursor inside the
// current field; src is the read cursor over that field's samples
822  uint8_t *ptr = (uint8_t*)rec->indiv.s, *dst = NULL, *src;
823  bcf_dec_t *dec = &rec->d;
824  hts_expand(bcf_fmt_t, rec->n_fmt, dec->m_fmt, dec->fmt);
825  for (i=0; i<dec->m_fmt; ++i) dec->fmt[i].p_free = 0;
826 
827  for (i=0; i<rec->n_fmt; i++)
828  {
829  ptr = bcf_unpack_fmt_core1(ptr, rec->n_sample, &dec->fmt[i]);
// start one element before the data so the first "src += size" below
// lands on sample 0
830  src = dec->fmt[i].p - dec->fmt[i].size;
831  if ( dst )
832  {
// not the first field: slide this field's p_off leading bytes down so they
// directly follow the previous field's already-compacted data, then repoint p
833  memmove(dec->fmt[i-1].p + dec->fmt[i-1].p_len, dec->fmt[i].p - dec->fmt[i].p_off, dec->fmt[i].p_off);
834  dec->fmt[i].p = dec->fmt[i-1].p + dec->fmt[i-1].p_len + dec->fmt[i].p_off;
835  }
836  dst = dec->fmt[i].p;
837  for (j=0; j<hdr->nsamples_ori; j++)
838  {
839  src += dec->fmt[i].size;
840  if ( !bit_array_test(hdr->keep_samples,j) ) continue;
// kept sample: copy its fmt.size bytes to the write cursor
841  memmove(dst, src, dec->fmt[i].size);
842  dst += dec->fmt[i].size;
843  }
// shrink the block length by the number of bytes dropped from this field
844  rec->indiv.l -= dec->fmt[i].p_len - (dst - dec->fmt[i].p);
845  dec->fmt[i].p_len = dst - dec->fmt[i].p;
846  }
// the fmt[] descriptors were filled in above, so flag FORMAT as unpacked
847  rec->unpacked |= BCF_UN_FMT;
848 
849  rec->n_sample = bcf_hdr_nsamples(hdr);
850  return 0;
851 }
852 
853 int bcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
854 {
855  if (!fp->is_bin) return vcf_read(fp,h,v);
856  int ret = bcf_read1_core(fp->fp.bgzf, v);
857  if ( ret!=0 || !h->keep_samples ) return ret;
858  return bcf_subset_format(h,v);
859 }
860 
861 int bcf_readrec(BGZF *fp, void *null, void *vv, int *tid, int *beg, int *end)
862 {
863  bcf1_t *v = (bcf1_t *) vv;
864  int ret;
865  if ((ret = bcf_read1_core(fp, v)) >= 0)
866  *tid = v->rid, *beg = v->pos, *end = v->pos + v->rlen;
867  return ret;
868 }
869 
870 static inline void bcf1_sync_id(bcf1_t *line, kstring_t *str)
871 {
872  // single typed string
873  if ( line->d.id && strcmp(line->d.id, ".") ) bcf_enc_vchar(str, strlen(line->d.id), line->d.id);
874  else bcf_enc_size(str, 0, BCF_BT_CHAR);
875 }
876 static inline void bcf1_sync_alleles(bcf1_t *line, kstring_t *str)
877 {
878  // list of typed strings
879  int i;
880  for (i=0; i<line->n_allele; i++)
881  bcf_enc_vchar(str, strlen(line->d.allele[i]), line->d.allele[i]);
882  if ( !line->rlen && line->n_allele ) line->rlen = strlen(line->d.allele[0]);
883 }
884 static inline void bcf1_sync_filter(bcf1_t *line, kstring_t *str)
885 {
886  // typed vector of integers
887  if ( line->d.n_flt ) bcf_enc_vint(str, line->d.n_flt, line->d.flt, -1);
888  else bcf_enc_vint(str, 0, 0, -1);
889 }
890 
891 static inline void bcf1_sync_info(bcf1_t *line, kstring_t *str)
892 {
893  // pairs of typed vectors
894  int i, irm = -1;
895  for (i=0; i<line->n_info; i++)
896  {
897  bcf_info_t *info = &line->d.info[i];
898  if ( !info->vptr )
899  {
900  // marked for removal
901  if ( irm < 0 ) irm = i;
902  continue;
903  }
904  kputsn_(info->vptr - info->vptr_off, info->vptr_len + info->vptr_off, str);
905  if ( irm >=0 )
906  {
907  bcf_info_t tmp = line->d.info[irm]; line->d.info[irm] = line->d.info[i]; line->d.info[i] = tmp;
908  while ( irm<=i && line->d.info[irm].vptr ) irm++;
909  }
910  }
911  if ( irm>=0 ) line->n_info = irm;
912 }
913 
// Bring the record's packed buffers (line->shared and line->indiv) up to date
// with the decoded fields in line->d.
//   line: record to synchronise; its buffers and the vptr / p pointers of its
//         INFO and FORMAT entries are rewritten as needed
// Returns 0 on every path. Both dirty flags are cleared at the end.
914 static int bcf1_sync(bcf1_t *line)
915 {
// remembered so a reallocation or replacement of shared.s can be detected below
916  char *shared_ori = line->shared.s;
917  size_t prev_len;
918 
919  kstring_t tmp = {0,0,0};
920  if ( !line->shared.l )
921  {
922  // New line created via API, BCF data blocks do not exist. Get it ready for BCF output
// work on a copy of the kstring so its existing allocation is reused; the
// copy is stored back once all four sections have been written
923  tmp = line->shared;
924  bcf1_sync_id(line, &tmp);
925  line->unpack_size[0] = tmp.l; prev_len = tmp.l;
926 
927  bcf1_sync_alleles(line, &tmp);
928  line->unpack_size[1] = tmp.l - prev_len; prev_len = tmp.l;
929 
930  bcf1_sync_filter(line, &tmp);
931  line->unpack_size[2] = tmp.l - prev_len;
932 
933  bcf1_sync_info(line, &tmp);
934  line->shared = tmp;
935  }
936  else if ( line->d.shared_dirty )
937  {
938  // The line was edited, update the BCF data block, ptr_ori points
939  // to the original unchanged BCF data.
// Each section is either re-encoded from line->d (when its dirty bit is set)
// or copied verbatim from the old block; ptr_ori advances by the OLD section
// size while unpack_size[] is updated to the NEW one.
940  uint8_t *ptr_ori = (uint8_t *) line->shared.s;
941 
942  assert( line->unpacked & BCF_UN_STR );
943 
944  // ID: single typed string
945  if ( line->d.shared_dirty & BCF1_DIRTY_ID )
946  bcf1_sync_id(line, &tmp);
947  else
948  kputsn_(ptr_ori, line->unpack_size[0], &tmp);
949  ptr_ori += line->unpack_size[0];
950  line->unpack_size[0] = tmp.l; prev_len = tmp.l;
951 
952  // REF+ALT: list of typed strings
953  if ( line->d.shared_dirty & BCF1_DIRTY_ALS )
954  bcf1_sync_alleles(line, &tmp);
955  else
956  {
957  kputsn_(ptr_ori, line->unpack_size[1], &tmp);
958  if ( !line->rlen && line->n_allele ) line->rlen = strlen(line->d.allele[0]);
959  }
960  ptr_ori += line->unpack_size[1];
961  line->unpack_size[1] = tmp.l - prev_len; prev_len = tmp.l;
962 
963  if ( line->unpacked & BCF_UN_FLT )
964  {
965  // FILTER: typed vector of integers
966  if ( line->d.shared_dirty & BCF1_DIRTY_FLT )
967  bcf1_sync_filter(line, &tmp);
968  else if ( line->d.n_flt )
969  kputsn_(ptr_ori, line->unpack_size[2], &tmp);
970  else
971  bcf_enc_vint(&tmp, 0, 0, -1);
972  ptr_ori += line->unpack_size[2];
973  line->unpack_size[2] = tmp.l - prev_len;
974 
975  if ( line->unpacked & BCF_UN_INFO )
976  {
977  // INFO: pairs of typed vectors
978  if ( line->d.shared_dirty & BCF1_DIRTY_INF )
979  {
980  bcf1_sync_info(line, &tmp);
// INFO was re-encoded in full, so nothing of the old block is left to copy:
// move ptr_ori to the end of the old data
981  ptr_ori = (uint8_t*)line->shared.s + line->shared.l;
982  }
983  }
984  }
985 
// bytes of the old block that lie beyond ptr_ori, i.e. sections that were
// not re-encoded above; they are appended unchanged
986  int size = line->shared.l - (size_t)ptr_ori + (size_t)line->shared.s;
987  if ( size ) kputsn_(ptr_ori, size, &tmp);
988 
989  free(line->shared.s);
990  line->shared = tmp;
991  }
992  if ( line->shared.s != shared_ori && line->unpacked & BCF_UN_INFO )
993  {
994  // Reallocated line->shared.s block invalidated line->d.info[].vptr pointers
// INFO entries start right after the ID, allele and FILTER sections
995  size_t off_new = line->unpack_size[0] + line->unpack_size[1] + line->unpack_size[2];
996  int i;
997  for (i=0; i<line->n_info; i++)
998  {
// remember a separately allocated value buffer (if any) before vptr is
// redirected into the shared block, so that it can be released afterwards
999  uint8_t *vptr_free = line->d.info[i].vptr_free ? line->d.info[i].vptr - line->d.info[i].vptr_off : NULL;
1000  line->d.info[i].vptr = (uint8_t*) line->shared.s + off_new + line->d.info[i].vptr_off;
1001  off_new += line->d.info[i].vptr_len + line->d.info[i].vptr_off;
1002  if ( vptr_free )
1003  {
1004  free(vptr_free);
1005  line->d.info[i].vptr_free = 0;
1006  }
1007  }
1008  }
1009 
1010  if ( line->n_sample && line->n_fmt && (!line->indiv.l || line->d.indiv_dirty) )
1011  {
1012  // The genotype fields changed or are not present
// rebuild the individual block from scratch in a fresh buffer
1013  tmp.l = tmp.m = 0; tmp.s = NULL;
// irm: earliest FORMAT slot marked for removal (p == NULL), -1 while none
1014  int i, irm = -1;
1015  for (i=0; i<line->n_fmt; i++)
1016  {
1017  bcf_fmt_t *fmt = &line->d.fmt[i];
1018  if ( !fmt->p )
1019  {
1020  // marked for removal
1021  if ( irm < 0 ) irm = i;
1022  continue;
1023  }
1024  kputsn_(fmt->p - fmt->p_off, fmt->p_len + fmt->p_off, &tmp);
1025  if ( irm >=0 )
1026  {
// swap the live entry forward into the hole, then look for the next hole
1027  bcf_fmt_t tfmt = line->d.fmt[irm]; line->d.fmt[irm] = line->d.fmt[i]; line->d.fmt[i] = tfmt;
1028  while ( irm<=i && line->d.fmt[irm].p ) irm++;
1029  }
1030 
1031  }
1032  if ( irm>=0 ) line->n_fmt = irm;
1033  free(line->indiv.s);
1034  line->indiv = tmp;
1035 
1036  // Reallocated line->indiv.s block invalidated line->d.fmt[].p pointers
1037  size_t off_new = 0;
1038  for (i=0; i<line->n_fmt; i++)
1039  {
// same pattern as for INFO above: note any separately allocated buffer,
// repoint p into the new block, then release the old buffer
1040  uint8_t *p_free = line->d.fmt[i].p_free ? line->d.fmt[i].p - line->d.fmt[i].p_off : NULL;
1041  line->d.fmt[i].p = (uint8_t*) line->indiv.s + off_new + line->d.fmt[i].p_off;
1042  off_new += line->d.fmt[i].p_len + line->d.fmt[i].p_off;
1043  if ( p_free )
1044  {
1045  free(p_free);
1046  line->d.fmt[i].p_free = 0;
1047  }
1048  }
1049  }
// a record without samples cannot carry FORMAT fields
1050  if ( !line->n_sample ) line->n_fmt = 0;
1051  line->d.shared_dirty = line->d.indiv_dirty = 0;
1052  return 0;
1053 }
1054 
1056 {
1057  bcf1_sync(src);
1058 
1059  bcf1_t *out = bcf_init1();
1060 
1061  out->rid = src->rid;
1062  out->pos = src->pos;
1063  out->rlen = src->rlen;
1064  out->qual = src->qual;
1065  out->n_info = src->n_info; out->n_allele = src->n_allele;
1066  out->n_fmt = src->n_fmt; out->n_sample = src->n_sample;
1067 
1068  out->shared.m = out->shared.l = src->shared.l;
1069  out->shared.s = (char*) malloc(out->shared.l);
1070  memcpy(out->shared.s,src->shared.s,out->shared.l);
1071 
1072  out->indiv.m = out->indiv.l = src->indiv.l;
1073  out->indiv.s = (char*) malloc(out->indiv.l);
1074  memcpy(out->indiv.s,src->indiv.s,out->indiv.l);
1075 
1076  return out;
1077 }
1078 
1079 int bcf_write(htsFile *hfp, const bcf_hdr_t *h, bcf1_t *v)
1080 {
1081  if ( bcf_hdr_nsamples(h)!=v->n_sample )
1082  {
1083  fprintf(stderr,"[%s:%d %s] Broken VCF record, the number of columns at %s:%d does not match the number of samples (%d vs %d).\n",
1084  __FILE__,__LINE__,__FUNCTION__,bcf_seqname(h,v),v->pos+1, v->n_sample,bcf_hdr_nsamples(h));
1085  return -1;
1086  }
1087 
1088  if ( !hfp->is_bin ) return vcf_write(hfp,h,v);
1089 
1090  if ( v->errcode )
1091  {
1092  // vcf_parse1() encountered a new contig or tag, undeclared in the
1093  // header. At this point, the header must have been printed,
1094  // proceeding would lead to a broken BCF file. Errors must be checked
1095  // and cleared by the caller before we can proceed.
1096  fprintf(stderr,"[%s:%d %s] Unchecked error (%d), exiting.\n", __FILE__,__LINE__,__FUNCTION__,v->errcode);
1097  exit(1);
1098  }
1099  bcf1_sync(v); // check if the BCF record was modified
1100 
1101  BGZF *fp = hfp->fp.bgzf;
1102  uint32_t x[8];
1103  x[0] = v->shared.l + 24; // to include six 32-bit integers
1104  x[1] = v->indiv.l;
1105  memcpy(x + 2, v, 16);
1106  x[6] = (uint32_t)v->n_allele<<16 | v->n_info;
1107  x[7] = (uint32_t)v->n_fmt<<24 | v->n_sample;
1108  if ( bgzf_write(fp, x, 32) != 32 ) return -1;
1109  if ( bgzf_write(fp, v->shared.s, v->shared.l) != v->shared.l ) return -1;
1110  if ( bgzf_write(fp, v->indiv.s, v->indiv.l) != v->indiv.l ) return -1;
1111  return 0;
1112 }
1113 
1114 /**********************
1115  *** VCF header I/O ***
1116  **********************/
1117 
1119 {
 // NOTE(review): the signature line of this function is not present in this
 // listing; from the body it takes an htsFile* named fp and returns a
 // bcf_hdr_t* (h) or a null pointer.
 //
 // Reads header text from fp line by line into txt. Empty lines are skipped.
 // Reading stops after the first line that starts with '#' but not "##"
 // (the column/sample line). A line not starting with '#' aborts with a null
 // return. When fp->fn_aux is set, "##contig=<ID=..,length=..>" lines built
 // from that file are inserted just before the column line. The collected
 // text is passed to bcf_hdr_parse(). Afterwards the tabix index for fp->fn,
 // if it loads, is used to add contigs missing from the header.
1120  kstring_t txt, *s = &fp->line;
1121  bcf_hdr_t *h;
1122  h = bcf_hdr_init("r");
1123  txt.l = txt.m = 0; txt.s = 0;
1124  while (hts_getline(fp, KS_SEP_LINE, s) >= 0) {
1125  if (s->l == 0) continue;
1126  if (s->s[0] != '#') {
1127  if (hts_verbose >= 2)
1128  fprintf(stderr, "[E::%s] no sample line\n", __func__);
1129  free(txt.s);
1130  bcf_hdr_destroy(h);
1131  return 0;
1132  }
1133  if (s->s[1] != '#' && fp->fn_aux) { // insert contigs here
 // NOTE(review): the result of gzopen() is used without a NULL check.
 // Each record of the aux file contributes two tokens: name, then length.
1134  int dret;
1135  gzFile f;
1136  kstream_t *ks;
1137  kstring_t tmp;
1138  tmp.l = tmp.m = 0; tmp.s = 0;
1139  f = gzopen(fp->fn_aux, "r");
1140  ks = ks_init(f);
1141  while (ks_getuntil(ks, 0, &tmp, &dret) >= 0) {
1142  int c;
1143  kputs("##contig=<ID=", &txt); kputs(tmp.s, &txt);
1144  ks_getuntil(ks, 0, &tmp, &dret);
1145  kputs(",length=", &txt); kputw(atol(tmp.s), &txt);
1146  kputsn(">\n", 2, &txt);
1147  if (dret != '\n')
1148  while ((c = ks_getc(ks)) != '\n' && c != -1); // skip the rest of the line
1149  }
1150  free(tmp.s);
1151  ks_destroy(ks);
1152  gzclose(f);
1153  }
1154  kputsn(s->s, s->l, &txt);
1155  kputc('\n', &txt);
1156  if (s->s[1] != '#') break;
1157  }
1158  if ( !txt.s )
1159  {
 // NOTE(review): h (allocated by bcf_hdr_init above) is not destroyed on
 // this return path.
1160  fprintf(stderr,"[%s:%d %s] Could not read the header\n", __FILE__,__LINE__,__FUNCTION__);
1161  return NULL;
1162  }
1163  bcf_hdr_parse(h, txt.s);
1164 
1165  // check tabix index, are all contigs listed in the header? add the missing ones
1166  tbx_t *idx = tbx_index_load(fp->fn);
1167  if ( idx )
1168  {
1169  int i, n, need_sync = 0;
1170  const char **names = tbx_seqnames(idx, &n);
1171  for (i=0; i<n; i++)
1172  {
1173  bcf_hrec_t *hrec = bcf_hdr_get_hrec(h, BCF_DT_CTG, (char*) names[i]);
1174  if ( hrec ) continue;
 // Contig known to the index but not the header: synthesize a
 // "contig" record with a placeholder length of 2147483647.
1175  hrec = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t));
1176  hrec->key = strdup("contig");
1177  bcf_hrec_add_key(hrec, "ID", strlen("ID"));
1178  bcf_hrec_set_val(hrec, hrec->nkeys-1, (char*) names[i], strlen(names[i]), 0);
1179  bcf_hrec_add_key(hrec, "length", strlen("length"));
1180  bcf_hrec_set_val(hrec, hrec->nkeys-1, "2147483647", strlen("2147483647"), 0);
1181  bcf_hdr_add_hrec(h, hrec);
1182  need_sync = 1;
1183  }
1184  free(names);
1185  tbx_destroy(idx);
1186  if ( need_sync )
1187  bcf_hdr_sync(h);
1188  }
1189  free(txt.s);
1190  return h;
1191 }
1192 
1193 int bcf_hdr_set(bcf_hdr_t *hdr, const char *fname)
1194 {
1195  int i, n;
1196  char **lines = hts_readlines(fname, &n);
1197  if ( !lines ) return 1;
1198  for (i=0; i<n-1; i++)
1199  {
1200  int k;
1201  bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr,lines[i],&k);
1202  bcf_hdr_add_hrec(hdr, hrec);
1203  free(lines[i]);
1204  }
1205  bcf_hdr_parse_sample_line(hdr,lines[n-1]);
1206  free(lines[n-1]);
1207  free(lines);
1208  bcf_hdr_sync(hdr);
1209  return 0;
1210 }
1211 
1212 static void _bcf_hrec_format(const bcf_hrec_t *hrec, int is_bcf, kstring_t *str)
1213 {
1214  if ( !hrec->value )
1215  {
1216  int j, nout = 0;
1217  ksprintf(str, "##%s=<", hrec->key);
1218  for (j=0; j<hrec->nkeys; j++)
1219  {
1220  // do not output IDX if output is VCF
1221  if ( !is_bcf && !strcmp("IDX",hrec->keys[j]) ) continue;
1222  if ( nout ) kputc(',',str);
1223  ksprintf(str,"%s=%s", hrec->keys[j], hrec->vals[j]);
1224  nout++;
1225  }
1226  ksprintf(str,">\n");
1227  }
1228  else
1229  ksprintf(str,"##%s=%s\n", hrec->key,hrec->value);
1230 }
1231 
1232 void bcf_hrec_format(const bcf_hrec_t *hrec, kstring_t *str)
1233 {
1234  _bcf_hrec_format(hrec,0,str);
1235 }
1236 char *bcf_hdr_fmt_text(const bcf_hdr_t *hdr, int is_bcf, int *len)
1237 {
1238  int i;
1239  kstring_t txt = {0,0,0};
1240  for (i=0; i<hdr->nhrec; i++)
1241  _bcf_hrec_format(hdr->hrec[i], is_bcf, &txt);
1242 
1243  ksprintf(&txt,"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO");
1244  if ( bcf_hdr_nsamples(hdr) )
1245  {
1246  ksprintf(&txt,"\tFORMAT");
1247  for (i=0; i<bcf_hdr_nsamples(hdr); i++)
1248  ksprintf(&txt,"\t%s", hdr->samples[i]);
1249  }
1250  ksprintf(&txt,"\n");
1251 
1252  if ( len ) *len = txt.l;
1253  return txt.s;
1254 }
1255 
1256 const char **bcf_hdr_seqnames(const bcf_hdr_t *h, int *n)
1257 {
1258  vdict_t *d = (vdict_t*)h->dict[BCF_DT_CTG];
1259  int tid, m = kh_size(d);
1260  const char **names = (const char**) calloc(m,sizeof(const char*));
1261  khint_t k;
1262  for (k=kh_begin(d); k<kh_end(d); k++)
1263  {
1264  if ( !kh_exist(d,k) ) continue;
1265  tid = kh_val(d,k).id;
1266  assert( tid<m );
1267  names[tid] = kh_key(d,k);
1268  }
1269  // sanity check: there should be no gaps
1270  for (tid=0; tid<m; tid++)
1271  assert(names[tid]);
1272  *n = m;
1273  return names;
1274 }
1275 
1276 int vcf_hdr_write(htsFile *fp, const bcf_hdr_t *h)
1277 {
1278  int hlen;
1279  char *htxt = bcf_hdr_fmt_text(h, 0, &hlen);
1280  while (hlen && htxt[hlen-1] == 0) --hlen; // kill trailing zeros
1281  int ret;
1282  if ( fp->is_compressed==1 )
1283  ret = bgzf_write(fp->fp.bgzf, htxt, hlen);
1284  else
1285  ret = hwrite(fp->fp.hfile, htxt, hlen);
1286  free(htxt);
1287  return ret<0 ? -1 : 0;
1288 }
1289 
1290 /***********************
1291  *** Typed value I/O ***
1292  ***********************/
1293 
1294 void bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize)
1295 {
1296  int32_t max = INT32_MIN + 1, min = INT32_MAX;
1297  int i;
1298  if (n == 0) bcf_enc_size(s, 0, BCF_BT_NULL);
1299  else if (n == 1) bcf_enc_int1(s, a[0]);
1300  else {
1301  if (wsize <= 0) wsize = n;
1302  for (i = 0; i < n; ++i) {
1303  if (a[i] == bcf_int32_missing || a[i] == bcf_int32_vector_end ) continue;
1304  if (max < a[i]) max = a[i];
1305  if (min > a[i]) min = a[i];
1306  }
1307  if (max <= INT8_MAX && min > bcf_int8_vector_end) {
1308  bcf_enc_size(s, wsize, BCF_BT_INT8);
1309  for (i = 0; i < n; ++i)
1310  if ( a[i]==bcf_int32_vector_end ) kputc(bcf_int8_vector_end, s);
1311  else if ( a[i]==bcf_int32_missing ) kputc(bcf_int8_missing, s);
1312  else kputc(a[i], s);
1313  } else if (max <= INT16_MAX && min > bcf_int16_vector_end) {
1314  bcf_enc_size(s, wsize, BCF_BT_INT16);
1315  for (i = 0; i < n; ++i)
1316  {
1317  int16_t x;
1318  if ( a[i]==bcf_int32_vector_end ) x = bcf_int16_vector_end;
1319  else if ( a[i]==bcf_int32_missing ) x = bcf_int16_missing;
1320  else x = a[i];
1321  kputsn((char*)&x, 2, s);
1322  }
1323  } else {
1324  bcf_enc_size(s, wsize, BCF_BT_INT32);
1325  for (i = 0; i < n; ++i) {
1326  int32_t x = a[i];
1327  kputsn((char*)&x, 4, s);
1328  }
1329  }
1330  }
1331 }
1332 
1333 void bcf_enc_vfloat(kstring_t *s, int n, float *a)
1334 {
1335  bcf_enc_size(s, n, BCF_BT_FLOAT);
1336  kputsn((char*)a, n << 2, s);
1337 }
1338 
1339 void bcf_enc_vchar(kstring_t *s, int l, const char *a)
1340 {
1341  bcf_enc_size(s, l, BCF_BT_CHAR);
1342  kputsn(a, l, s);
1343 }
1344 
1345 void bcf_fmt_array(kstring_t *s, int n, int type, void *data)
1346 {
1347  int j = 0;
1348  if (n == 0) {
1349  kputc('.', s);
1350  return;
1351  }
1352  if (type == BCF_BT_CHAR)
1353  {
1354  char *p = (char*)data;
1355  for (j = 0; j < n && *p; ++j, ++p)
1356  {
1357  if ( *p==bcf_str_missing ) kputc('.', s);
1358  else kputc(*p, s);
1359  }
1360  }
1361  else
1362  {
1363 #define BRANCH(type_t, is_missing, is_vector_end, kprint) { \
1364  type_t *p = (type_t *) data; \
1365  for (j=0; j<n; j++) \
1366  { \
1367  if ( is_vector_end ) break; \
1368  if ( j ) kputc(',', s); \
1369  if ( is_missing ) kputc('.', s); \
1370  else kprint; \
1371  } \
1372 }
1373  switch (type) {
1374  case BCF_BT_INT8: BRANCH(int8_t, p[j]==bcf_int8_missing, p[j]==bcf_int8_vector_end, kputw(p[j], s)); break;
1375  case BCF_BT_INT16: BRANCH(int16_t, p[j]==bcf_int16_missing, p[j]==bcf_int16_vector_end, kputw(p[j], s)); break;
1376  case BCF_BT_INT32: BRANCH(int32_t, p[j]==bcf_int32_missing, p[j]==bcf_int32_vector_end, kputw(p[j], s)); break;
1377  case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(p[j]), bcf_float_is_vector_end(p[j]), ksprintf(s, "%g", p[j])); break;
1378  default: fprintf(stderr,"todo: type %d\n", type); exit(1); break;
1379  }
1380 #undef BRANCH
1381 }
1382 }
1383 
1385 {
 // NOTE(review): the signature line is not present in this listing; the body
 // uses a kstring_t* named s and a byte pointer named ptr.
 // Decodes the size/type descriptor at ptr, appends the text form of the
 // array that follows to s, and returns the address just past the array
 // (element count shifted by the per-type size shift).
1386  int x, type;
1387  x = bcf_dec_size(ptr, &ptr, &type);
1388  bcf_fmt_array(s, x, type, ptr);
1389  return ptr + (x << bcf_type_shift[type]);
1390 }
1391 
1392 /********************
1393  *** VCF site I/O ***
1394  ********************/
1395 
/*
 * Per-FORMAT-field scratch state used while parsing the sample columns.
 * The y and buf members are required by the FORMAT parser, which reads
 * fmt[j].y and assigns fmt[j].buf.
 */
typedef struct {
    int key, max_m, size, offset;           // dictionary id; max values per sample; bytes per sample; offset into the scratch buffer
    uint32_t is_gt:1, max_g:15, max_l:16;   // GT flag; max allele count; max field text length
    uint32_t y;                             // header info word for this tag (value type is read from bits 4..7)
    uint8_t *buf;                           // start of this field's slice of the scratch buffer
} fmt_aux_t;
1402 
1403 static inline void align_mem(kstring_t *s)
1404 {
1405  if (s->l&7) {
1406  uint64_t zero = 0;
1407  int l = ((s->l + 7)>>3<<3) - s->l;
1408  kputsn((char*)&zero, l, s);
1409  }
1410 }
1411 
1412 // p,q is the start and the end of the FORMAT field
/*
 * Parse the FORMAT column and all sample columns of one VCF line into v->indiv.
 *
 *   s    the whole input line; it is modified in place (tabs and ':' after
 *        FORMAT keys are overwritten with NUL)
 *   h    header; h->mem is used as scratch space and dummy FORMAT definitions
 *        may be added to it (const is cast away for that)
 *   v    record being filled: n_fmt, n_sample, indiv and possibly errcode
 *   p,q  start and end of the FORMAT field inside s
 *
 * Works in two passes over the sample text: the first collects, per FORMAT
 * field, the maximum vector size, text length and allele count; the second
 * converts the values into fixed-size per-sample slots, padding with
 * vector-end / missing markers. The slots are then encoded into v->indiv.
 *
 * Returns 0 on success (also when the header has no samples), -1 when there
 * are no sample columns or the number of parsed samples differs from the
 * header (BCF_ERR_NCOLS is set in that case). Calls abort() for FORMAT value
 * types other than String, Integer and Real.
 */
1413 int _vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p, char *q)
1414 {
1415  if ( !bcf_hdr_nsamples(h) ) return 0;
1416 
1417  char *r, *t;
1418  int j, l, m, g;
1419  khint_t k;
1420  ks_tokaux_t aux1;
1421  vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
1422  kstring_t *mem = (kstring_t*)&h->mem;
1423  mem->l = 0;
1424 
1425  // count the number of format fields
1426  for (r = p, v->n_fmt = 1; *r; ++r)
1427  if (*r == ':') ++v->n_fmt;
1428  char *end = s->s + s->l;
1429  if ( q>=end )
1430  {
1431  fprintf(stderr,"[%s:%d %s] Error: FORMAT column with no sample columns starting at %s:%d\n", __FILE__,__LINE__,__FUNCTION__,s->s,v->pos+1);
1432  return -1;
1433  }
1434 
 // one fmt_aux_t per FORMAT key, allocated on the stack
1435  fmt_aux_t *fmt = (fmt_aux_t*)alloca(v->n_fmt * sizeof(fmt_aux_t));
1436  // get format information from the dictionary
1437  for (j = 0, t = kstrtok(p, ":", &aux1); t; t = kstrtok(0, 0, &aux1), ++j) {
1438  *(char*)aux1.p = 0;
1439  k = kh_get(vdict, d, t);
1440  if (k == kh_end(d) || kh_val(d, k).info[BCF_HL_FMT] == 15) {
 // tag unknown to the header: register a placeholder String definition
1441  fprintf(stderr, "[W::%s] FORMAT '%s' is not defined in the header, assuming Type=String\n", __func__, t);
1442  kstring_t tmp = {0,0,0};
1443  int l;
1444  ksprintf(&tmp, "##FORMAT=<ID=%s,Number=1,Type=String,Description=\"Dummy\">", t);
1445  bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
1446  free(tmp.s);
1447  if ( bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) ) bcf_hdr_sync((bcf_hdr_t*)h);
1448  k = kh_get(vdict, d, t);
1450  }
1451  fmt[j].max_l = fmt[j].max_m = fmt[j].max_g = 0;
1452  fmt[j].key = kh_val(d, k).id;
1453  fmt[j].is_gt = !strcmp(t, "GT");
1454  fmt[j].y = h->id[0][fmt[j].key].val->info[BCF_HL_FMT];
1455  }
1456  // compute max
1457  int n_sample_ori = -1;
1458  r = q + 1; // r: position in the format string
1459  m = l = g = 1, v->n_sample = 0; // m: max vector size, l: max field len, g: max number of alleles
1460  while ( r<end )
1461  {
1462  // can we skip some samples?
1463  if ( h->keep_samples )
1464  {
1465  n_sample_ori++;
1466  if ( !bit_array_test(h->keep_samples,n_sample_ori) )
1467  {
 // NOTE(review): *r is read before the r<end bound is tested
1468  while ( *r!='\t' && r<end ) r++;
1469  if ( *r=='\t' ) { *r = 0; r++; }
1470  continue;
1471  }
1472  }
1473 
1474  // collect fmt stats: max vector size, length, number of alleles
1475  j = 0; // j-th format field
1476  for (;;)
1477  {
1478  if ( *r == '\t' ) *r = 0;
1479  if ( *r == ':' || !*r ) // end of field or end of sample
1480  {
1481  if (fmt[j].max_m < m) fmt[j].max_m = m;
1482  if (fmt[j].max_l < l - 1) fmt[j].max_l = l - 1;
1483  if (fmt[j].is_gt && fmt[j].max_g < g) fmt[j].max_g = g;
1484  l = 0, m = g = 1;
1485  if ( *r==':' ) j++;
1486  else break;
1487  }
1488  else if ( *r== ',' ) m++;
1489  else if ( fmt[j].is_gt && (*r == '|' || *r == '/') ) g++;
1490  if ( r>=end ) break;
1491  r++; l++;
1492  }
1493  v->n_sample++;
1494  if ( v->n_sample == bcf_hdr_nsamples(h) ) break;
1495  r++;
1496  }
1497 
1498  // allocate memory for arrays
1499  for (j = 0; j < v->n_fmt; ++j) {
1500  fmt_aux_t *f = &fmt[j];
1501  if ( !f->max_m ) f->max_m = 1; // omitted trailing format field
 // per-sample slot size in bytes: 4 per allele for GT, max text length for
 // other strings, 4 per value for Integer/Real
1502  if ((f->y>>4&0xf) == BCF_HT_STR) {
1503  f->size = f->is_gt? f->max_g << 2 : f->max_l;
1504  } else if ((f->y>>4&0xf) == BCF_HT_REAL || (f->y>>4&0xf) == BCF_HT_INT) {
1505  f->size = f->max_m << 2;
1506  } else
1507  {
1508  fprintf(stderr, "[E::%s] the format type %d currently not supported\n", __func__, f->y>>4&0xf);
1509  abort(); // I do not know how to do with Flag in the genotype fields
1510  }
1511  align_mem(mem);
1512  f->offset = mem->l;
1513  ks_resize(mem, mem->l + v->n_sample * f->size);
1514  mem->l += v->n_sample * f->size;
1515  }
 // buffers are resolved only after all resizing is done, since mem->s may
 // have moved while growing
1516  for (j = 0; j < v->n_fmt; ++j)
1517  fmt[j].buf = (uint8_t*)mem->s + fmt[j].offset;
1518  // fill the sample fields; at beginning of the loop, t points to the first char of a format
1519  n_sample_ori = -1;
1520  t = q + 1; m = 0; // m: sample id
1521  while ( t<end )
1522  {
1523  // can we skip some samples?
1524  if ( h->keep_samples )
1525  {
1526  n_sample_ori++;
1527  if ( !bit_array_test(h->keep_samples,n_sample_ori) )
1528  {
1529  while ( *t && t<end ) t++;
1530  t++;
1531  continue;
1532  }
1533  }
1534  if ( m == bcf_hdr_nsamples(h) ) break;
1535 
1536  j = 0; // j-th format field, m-th sample
1537  while ( *t )
1538  {
1539  fmt_aux_t *z = &fmt[j];
1540  if ((z->y>>4&0xf) == BCF_HT_STR) {
1541  if (z->is_gt) { // genotypes
 // each allele is stored as (index+1)<<1 | phased-bit; '.' stores just the phased-bit
1542  int32_t is_phased = 0, *x = (int32_t*)(z->buf + z->size * m);
1543  for (l = 0;; ++t) {
1544  if (*t == '.') ++t, x[l++] = is_phased;
1545  else x[l++] = (strtol(t, &t, 10) + 1) << 1 | is_phased;
1546 #if THOROUGH_SANITY_CHECKS
1547  assert( 0 ); // success of strtol,strtod not checked
1548 #endif
1549  is_phased = (*t == '|');
1550  if (*t == ':' || *t == 0) break;
1551  }
1552  if ( !l ) x[l++] = 0; // An empty field, insert missing value
1553  for (; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
1554  } else {
1555  char *x = (char*)z->buf + z->size * m;
1556  for (r = t, l = 0; *t != ':' && *t; ++t) x[l++] = *t;
1557  for (; l < z->size; ++l) x[l] = 0;
1558  }
1559  } else if ((z->y>>4&0xf) == BCF_HT_INT) {
1560  int32_t *x = (int32_t*)(z->buf + z->size * m);
1561  for (l = 0;; ++t) {
1562  if (*t == '.') x[l++] = bcf_int32_missing, ++t; // ++t to skip "."
1563  else x[l++] = strtol(t, &t, 10);
1564  if (*t == ':' || *t == 0) break;
1565  }
1566  if ( !l ) x[l++] = bcf_int32_missing;
1567  for (; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
1568  } else if ((z->y>>4&0xf) == BCF_HT_REAL) {
1569  float *x = (float*)(z->buf + z->size * m);
1570  for (l = 0;; ++t) {
1571  if (*t == '.' && !isdigit(t[1])) bcf_float_set_missing(x[l++]), ++t; // ++t to skip "."
1572  else x[l++] = strtod(t, &t);
1573  if (*t == ':' || *t == 0) break;
1574  }
1575  if ( !l ) bcf_float_set_missing(x[l++]); // An empty field, insert missing value
1576  for (; l < z->size>>2; ++l) bcf_float_set_vector_end(x[l]);
1577  } else abort();
1578  if (*t == 0) {
 // sample ended early: remaining fields get one missing value followed by padding
1579  for (++j; j < v->n_fmt; ++j) { // fill end-of-vector values
1580  z = &fmt[j];
1581  if ((z->y>>4&0xf) == BCF_HT_STR) {
1582  if (z->is_gt) {
1583  int32_t *x = (int32_t*)(z->buf + z->size * m);
1584  x[0] = bcf_int32_missing;
1585  for (l = 1; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
1586  } else {
1587  char *x = (char*)z->buf + z->size * m;
1588  if ( z->size ) x[0] = '.';
1589  for (l = 1; l < z->size; ++l) x[l] = 0;
1590  }
1591  } else if ((z->y>>4&0xf) == BCF_HT_INT) {
1592  int32_t *x = (int32_t*)(z->buf + z->size * m);
1593  x[0] = bcf_int32_missing;
1594  for (l = 1; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
1595  } else if ((z->y>>4&0xf) == BCF_HT_REAL) {
1596  float *x = (float*)(z->buf + z->size * m);
1597  bcf_float_set_missing(x[0]);
1598  for (l = 1; l < z->size>>2; ++l) bcf_float_set_vector_end(x[l]);
1599  }
1600  }
1601  break;
1602  }
1603  else
1604  {
1605  if (*t == ':') ++j;
1606  t++;
1607  }
1608  }
1609  m++; t++;
1610  }
1611 
1612  // write individual genotype information
1613  kstring_t *str = &v->indiv;
1614  int i;
1615  if (v->n_sample > 0) {
1616  for (i = 0; i < v->n_fmt; ++i) {
1617  fmt_aux_t *z = &fmt[i];
1618  bcf_enc_int1(str, z->key);
1619  if ((z->y>>4&0xf) == BCF_HT_STR && !z->is_gt) {
1620  bcf_enc_size(str, z->size, BCF_BT_CHAR);
1621  kputsn((char*)z->buf, z->size * v->n_sample, str);
1622  } else if ((z->y>>4&0xf) == BCF_HT_INT || z->is_gt) {
1623  bcf_enc_vint(str, (z->size>>2) * v->n_sample, (int32_t*)z->buf, z->size>>2);
1624  } else {
1625  bcf_enc_size(str, z->size>>2, BCF_BT_FLOAT);
1626  kputsn((char*)z->buf, z->size * v->n_sample, str);
1627  }
1628  }
1629  }
1630 
1631  if ( v->n_sample!=bcf_hdr_nsamples(h) )
1632  {
1633  fprintf(stderr,"[%s:%d %s] Number of columns at %s:%d does not match the number of samples (%d vs %d).\n",
1634  __FILE__,__LINE__,__FUNCTION__,bcf_seqname(h,v),v->pos+1, v->n_sample,bcf_hdr_nsamples(h));
1635  v->errcode |= BCF_ERR_NCOLS;
1636  return -1;
1637  }
1638 
1639  return 0;
1640 }
1641 
/*
 * Parse one tab-separated VCF text line into the record v.
 *
 *   s  the line; it is tokenised in place (separators are overwritten with NUL)
 *   h  header used to resolve contig, FILTER, INFO and FORMAT names; names
 *      missing from it are added as dummy definitions (const is cast away)
 *   v  record to fill; it is cleared first with bcf_clear1()
 *
 * Columns are handled by position: 0 CHROM, 1 POS (stored 0-based), 2 ID,
 * 3 REF, 4 ALT, 5 QUAL, 6 FILTER, 7 INFO, 8 FORMAT plus samples. ID, REF, ALT,
 * FILTER and INFO are encoded into v->shared. Parsing stops early after QUAL,
 * FILTER or INFO depending on v->max_unpack.
 *
 * Returns 0, or the result of _vcf_parse_format() when a FORMAT column exists.
 */
1642 int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v)
1643 {
1644  int i = 0;
1645  char *p, *q, *r, *t;
1646  kstring_t *str;
1647  khint_t k;
1648  ks_tokaux_t aux;
1649 
1650  bcf_clear1(v);
1651  str = &v->shared;
1652  memset(&aux, 0, sizeof(ks_tokaux_t));
 // p: start of the current column, q: its end (set to NUL), i: column index
1653  for (p = kstrtok(s->s, "\t", &aux), i = 0; p; p = kstrtok(0, 0, &aux), ++i) {
1654  q = (char*)aux.p;
1655  *q = 0;
1656  if (i == 0) { // CHROM
1657  vdict_t *d = (vdict_t*)h->dict[BCF_DT_CTG];
1658  k = kh_get(vdict, d, p);
1659  if (k == kh_end(d))
1660  {
1661  // Simple error recovery for chromosomes not defined in the header. It will not help when VCF header has
1662  // been already printed, but will enable tools like vcfcheck to proceed.
1663  fprintf(stderr, "[W::%s] contig '%s' is not defined in the header. (Quick workaround: index the file with tabix.)\n", __func__, p);
1664  kstring_t tmp = {0,0,0};
1665  int l;
1666  ksprintf(&tmp, "##contig=<ID=%s,length=2147483647>", p);
1667  bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
1668  free(tmp.s);
1669  if ( bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) ) bcf_hdr_sync((bcf_hdr_t*)h);
1670  k = kh_get(vdict, d, p);
1672  }
1673  v->rid = kh_val(d, k).id;
1674  } else if (i == 1) { // POS
1675  v->pos = atoi(p) - 1;
1676  } else if (i == 2) { // ID
1677  if (strcmp(p, ".")) bcf_enc_vchar(str, q - p, p);
1678  else bcf_enc_size(str, 0, BCF_BT_CHAR);
1679  } else if (i == 3) { // REF
1680  bcf_enc_vchar(str, q - p, p);
1681  v->n_allele = 1, v->rlen = q - p;
1682  } else if (i == 4) { // ALT
1683  if (strcmp(p, ".")) {
 // each comma-separated allele is encoded as its own typed string
1684  for (r = t = p;; ++r) {
1685  if (*r == ',' || *r == 0) {
1686  bcf_enc_vchar(str, r - t, t);
1687  t = r + 1;
1688  ++v->n_allele;
1689  }
1690  if (r == q) break;
1691  }
1692  }
1693  } else if (i == 5) { // QUAL
1694  if (strcmp(p, ".")) v->qual = atof(p);
1695  else memcpy(&v->qual, &bcf_float_missing, 4);
1696  if ( v->max_unpack && !(v->max_unpack>>1) ) return 0; // BCF_UN_STR
1697  } else if (i == 6) { // FILTER
1698  if (strcmp(p, ".")) {
1699  int32_t *a;
1700  int n_flt = 1, i;
1701  ks_tokaux_t aux1;
1702  vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
1703  // count the number of filters
1704  if (*(q-1) == ';') *(q-1) = 0;
1705  for (r = p; *r; ++r)
1706  if (*r == ';') ++n_flt;
1707  a = (int32_t*)alloca(n_flt * 4);
1708  // add filters
1709  for (t = kstrtok(p, ";", &aux1), i = 0; t; t = kstrtok(0, 0, &aux1)) {
1710  *(char*)aux1.p = 0;
1711  k = kh_get(vdict, d, t);
1712  if (k == kh_end(d))
1713  {
1714  // Simple error recovery for FILTERs not defined in the header. It will not help when VCF header has
1715  // been already printed, but will enable tools like vcfcheck to proceed.
1716  fprintf(stderr, "[W::%s] FILTER '%s' is not defined in the header\n", __func__, t);
1717  kstring_t tmp = {0,0,0};
1718  int l;
1719  ksprintf(&tmp, "##FILTER=<ID=%s,Description=\"Dummy\">", t);
1720  bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
1721  free(tmp.s);
1722  if ( bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) ) bcf_hdr_sync((bcf_hdr_t*)h);
1723  k = kh_get(vdict, d, t);
1725  }
1726  a[i++] = kh_val(d, k).id;
1727  }
1728  n_flt = i;
1729  bcf_enc_vint(str, n_flt, a, -1);
1730  } else bcf_enc_vint(str, 0, 0, -1);
1731  if ( v->max_unpack && !(v->max_unpack>>2) ) return 0; // BCF_UN_FLT
1732  } else if (i == 7) { // INFO
1733  char *key;
1734  vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
1735  v->n_info = 0;
1736  if (strcmp(p, ".")) {
1737  if (*(q-1) == ';') *(q-1) = 0;
 // scan "key[=value];..." pairs: r walks the text, key marks the start
 // of the current key, c remembers the delimiter that was overwritten
1738  for (r = key = p;; ++r) {
1739  int c;
1740  char *val, *end;
1741  if (*r != ';' && *r != '=' && *r != 0) continue;
1742  val = end = 0;
1743  c = *r; *r = 0;
1744  if (c == '=') {
1745  val = r + 1;
1746  for (end = val; *end != ';' && *end != 0; ++end);
1747  c = *end; *end = 0;
1748  } else end = r;
1749  k = kh_get(vdict, d, key);
1750  if (k == kh_end(d) || kh_val(d, k).info[BCF_HL_INFO] == 15)
1751  {
1752  fprintf(stderr, "[W::%s] INFO '%s' is not defined in the header, assuming Type=String\n", __func__, key);
1753  kstring_t tmp = {0,0,0};
1754  int l;
1755  ksprintf(&tmp, "##INFO=<ID=%s,Number=1,Type=String,Description=\"Dummy\">", key);
1756  bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
1757  free(tmp.s);
1758  if ( bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) ) bcf_hdr_sync((bcf_hdr_t*)h);
1759  k = kh_get(vdict, d, key);
1761  }
1762  uint32_t y = kh_val(d, k).info[BCF_HL_INFO];
1763  ++v->n_info;
1764  bcf_enc_int1(str, kh_val(d, k).id);
1765  if (val == 0) {
1766  bcf_enc_size(str, 0, BCF_BT_NULL);
1767  } else if ((y>>4&0xf) == BCF_HT_FLAG || (y>>4&0xf) == BCF_HT_STR) { // if Flag has a value, treat it as a string
1768  bcf_enc_vchar(str, end - val, val);
1769  } else { // int/float value/array
1770  int i, n_val;
1771  char *t, *te;
1772  for (t = val, n_val = 1; *t; ++t) // count the number of values
1773  if (*t == ',') ++n_val;
1774  if ((y>>4&0xf) == BCF_HT_INT) {
1775  int32_t *z;
1776  z = (int32_t*)alloca(n_val<<2);
1777  for (i = 0, t = val; i < n_val; ++i, ++t)
1778  {
1779  z[i] = strtol(t, &te, 10);
1780  if ( te==t ) // conversion failed
1781  {
1782  z[i] = bcf_int32_missing;
1783  while ( *te && *te!=',' ) te++;
1784  }
1785  t = te;
1786  }
1787  bcf_enc_vint(str, n_val, z, -1);
 // an Integer INFO tag named END overrides the record length
1788  if (strcmp(key, "END") == 0) v->rlen = z[0] - v->pos;
1789  } else if ((y>>4&0xf) == BCF_HT_REAL) {
1790  float *z;
1791  z = (float*)alloca(n_val<<2);
1792  for (i = 0, t = val; i < n_val; ++i, ++t)
1793  {
1794  z[i] = strtod(t, &te);
1795  if ( te==t ) // conversion failed
1796  {
1797  bcf_float_set_missing(z[i]);
1798  while ( *te && *te!=',' ) te++;
1799  }
1800  t = te;
1801  }
1802  bcf_enc_vfloat(str, n_val, z);
1803  }
1804  }
1805  if (c == 0) break;
1806  r = end;
1807  key = r + 1;
1808  }
1809  }
1810  if ( v->max_unpack && !(v->max_unpack>>3) ) return 0;
1811  } else if (i == 8) // FORMAT
1812  return _vcf_parse_format(s, h, v, p, q);
1813  }
1814  return 0;
1815 }
1816 
1817 int vcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
1818 {
1819  int ret;
1820  ret = hts_getline(fp, KS_SEP_LINE, &fp->line);
1821  if (ret < 0) return -1;
1822  return vcf_parse1(&fp->line, h, v);
1823 }
1824 
1825 static inline uint8_t *bcf_unpack_fmt_core1(uint8_t *ptr, int n_sample, bcf_fmt_t *fmt)
1826 {
1827  uint8_t *ptr_start = ptr;
1828  fmt->id = bcf_dec_typed_int1(ptr, &ptr);
1829  fmt->n = bcf_dec_size(ptr, &ptr, &fmt->type);
1830  fmt->size = fmt->n << bcf_type_shift[fmt->type];
1831  fmt->p = ptr;
1832  fmt->p_off = ptr - ptr_start;
1833  fmt->p_free = 0;
1834  ptr += n_sample * fmt->size;
1835  fmt->p_len = ptr - fmt->p;
1836  return ptr;
1837 }
1838 
1839 static inline uint8_t *bcf_unpack_info_core1(uint8_t *ptr, bcf_info_t *info)
1840 {
1841  uint8_t *ptr_start = ptr;
1842  info->key = bcf_dec_typed_int1(ptr, &ptr);
1843  info->len = bcf_dec_size(ptr, &ptr, &info->type);
1844  info->vptr = ptr;
1845  info->vptr_off = ptr - ptr_start;
1846  info->vptr_free = 0;
1847  info->v1.i = 0;
1848  if (info->len == 1) {
1849  if (info->type == BCF_BT_INT8 || info->type == BCF_BT_CHAR) info->v1.i = *(int8_t*)ptr;
1850  else if (info->type == BCF_BT_INT32) info->v1.i = *(int32_t*)ptr;
1851  else if (info->type == BCF_BT_FLOAT) info->v1.f = *(float*)ptr;
1852  else if (info->type == BCF_BT_INT16) info->v1.i = *(int16_t*)ptr;
1853  }
1854  ptr += info->len << bcf_type_shift[info->type];
1855  info->vptr_len = ptr - info->vptr;
1856  return ptr;
1857 }
1858 
/*
 * Decode parts of a record's packed buffers into b->d.
 *
 *   b      record; b->unpacked tracks which groups were already decoded so
 *          each group is decoded at most once
 *   which  bit mask of BCF_UN_* groups to decode. BCF_UN_FLT also selects
 *          BCF_UN_STR, and BCF_UN_INFO also selects BCF_UN_SHR.
 *
 * The byte sizes of the ID, allele and FILTER sections are stored in
 * b->unpack_size[0..2] so later calls can locate the following section.
 * Returns 0 in all cases, including when b->shared is empty.
 */
1859 int bcf_unpack(bcf1_t *b, int which)
1860 {
1861  if ( !b->shared.l ) return 0; // Building a new BCF record from scratch
1862  uint8_t *ptr = (uint8_t*)b->shared.s, *ptr_ori;
1863  int *offset, i;
1864  bcf_dec_t *d = &b->d;
1865  if (which & BCF_UN_FLT) which |= BCF_UN_STR;
1866  if (which & BCF_UN_INFO) which |= BCF_UN_SHR;
1867  if ((which&BCF_UN_STR) && !(b->unpacked&BCF_UN_STR))
1868  {
1869  kstring_t tmp;
1870 
 // tmp temporarily adopts d's buffers so the kstring routines can grow them
1871  // ID
1872  tmp.l = 0; tmp.s = d->id; tmp.m = d->m_id;
1873  ptr_ori = ptr;
1874  ptr = bcf_fmt_sized_array(&tmp, ptr);
1875  b->unpack_size[0] = ptr - ptr_ori;
1876  kputc('\0', &tmp);
1877  d->id = tmp.s; d->m_id = tmp.m;
1878 
1879  // REF and ALT are in a single block (d->als) and d->alleles are pointers into this block
1880  tmp.l = 0; tmp.s = d->als; tmp.m = d->m_als;
 // offsets are recorded instead of pointers because tmp.s may move while growing
1881  offset = (int*)alloca(b->n_allele * sizeof(int));
1882  ptr_ori = ptr;
1883  for (i = 0; i < b->n_allele; ++i) {
1884  offset[i] = tmp.l;
1885  ptr = bcf_fmt_sized_array(&tmp, ptr);
1886  kputc('\0', &tmp);
1887  }
1888  b->unpack_size[1] = ptr - ptr_ori;
1889  d->als = tmp.s; d->m_als = tmp.m;
1890 
1891  hts_expand(char*, b->n_allele, d->m_allele, d->allele); // NM: hts_expand() is a macro
1892  for (i = 0; i < b->n_allele; ++i)
1893  d->allele[i] = d->als + offset[i];
1894  b->unpacked |= BCF_UN_STR;
1895  }
1896  if ((which&BCF_UN_FLT) && !(b->unpacked&BCF_UN_FLT)) { // FILTER
1897  ptr = (uint8_t*)b->shared.s + b->unpack_size[0] + b->unpack_size[1];
1898  ptr_ori = ptr;
 // a zero upper nibble in the descriptor byte means an empty FILTER list
1899  if (*ptr>>4) {
1900  int type;
1901  d->n_flt = bcf_dec_size(ptr, &ptr, &type);
1902  hts_expand(int, d->n_flt, d->m_flt, d->flt);
1903  for (i = 0; i < d->n_flt; ++i)
1904  d->flt[i] = bcf_dec_int1(ptr, type, &ptr);
1905  } else ++ptr, d->n_flt = 0;
1906  b->unpack_size[2] = ptr - ptr_ori;
1907  b->unpacked |= BCF_UN_FLT;
1908  }
1909  if ((which&BCF_UN_INFO) && !(b->unpacked&BCF_UN_INFO)) { // INFO
1910  ptr = (uint8_t*)b->shared.s + b->unpack_size[0] + b->unpack_size[1] + b->unpack_size[2];
1911  hts_expand(bcf_info_t, b->n_info, d->m_info, d->info);
1912  for (i = 0; i < d->m_info; ++i) d->info[i].vptr_free = 0;
1913  for (i = 0; i < b->n_info; ++i)
1914  ptr = bcf_unpack_info_core1(ptr, &d->info[i]);
1915  b->unpacked |= BCF_UN_INFO;
1916  }
1917  if ((which&BCF_UN_FMT) && b->n_sample && !(b->unpacked&BCF_UN_FMT)) { // FORMAT
1918  ptr = (uint8_t*)b->indiv.s;
1919  hts_expand(bcf_fmt_t, b->n_fmt, d->m_fmt, d->fmt);
1920  for (i = 0; i < d->m_fmt; ++i) d->fmt[i].p_free = 0;
1921  for (i = 0; i < b->n_fmt; ++i)
1922  ptr = bcf_unpack_fmt_core1(ptr, b->n_sample, &d->fmt[i]);
1923  b->unpacked |= BCF_UN_FMT;
1924  }
1925  return 0;
1926 }
1927 
/*
 * Append the VCF text line for record v to s, terminated by '\n'.
 *
 *   h  header used to turn contig, FILTER, INFO and FORMAT ids into names
 *   v  record; the decoded fields in v->d are read
 *   s  destination buffer (appended to, not cleared)
 *
 * Empty columns are written as ".". INFO entries with a null vptr and FORMAT
 * entries with a null p are skipped. Calls abort() when a FORMAT id is
 * negative. Returns 0.
 *
 * NOTE(review): listing line 1931 is absent here; this body reads v->d
 * without any visible unpack step, so one may belong there. Confirm against
 * the upstream source.
 */
1928 int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s)
1929 {
1930  int i;
1932  kputs(h->id[BCF_DT_CTG][v->rid].key, s); // CHROM
1933  kputc('\t', s); kputw(v->pos + 1, s); // POS
1934  kputc('\t', s); kputs(v->d.id ? v->d.id : ".", s); // ID
1935  kputc('\t', s); // REF
1936  if (v->n_allele > 0) kputs(v->d.allele[0], s);
1937  else kputc('.', s);
1938  kputc('\t', s); // ALT
1939  if (v->n_allele > 1) {
1940  for (i = 1; i < v->n_allele; ++i) {
1941  if (i > 1) kputc(',', s);
1942  kputs(v->d.allele[i], s);
1943  }
1944  } else kputc('.', s);
1945  kputc('\t', s); // QUAL
1946  if (memcmp(&v->qual, &bcf_float_missing, 4) == 0) kputc('.', s); // QUAL
1947  else ksprintf(s, "%g", v->qual);
1948  kputc('\t', s); // FILTER
1949  if (v->d.n_flt) {
1950  for (i = 0; i < v->d.n_flt; ++i) {
1951  if (i) kputc(';', s);
1952  kputs(h->id[BCF_DT_ID][v->d.flt[i]].key, s);
1953  }
1954  } else kputc('.', s);
1955  kputc('\t', s); // INFO
1956  if (v->n_info) {
1957  int first = 1;
1958  for (i = 0; i < v->n_info; ++i) {
1959  bcf_info_t *z = &v->d.info[i];
1960  if ( !z->vptr ) continue;
1961  if ( !first ) kputc(';', s); first = 0;
1962  kputs(h->id[BCF_DT_ID][z->key].key, s);
 // entries without a value are printed as the bare key
1963  if (z->len <= 0) continue;
1964  kputc('=', s);
1965  if (z->len == 1) {
1966  if (z->type == BCF_BT_FLOAT) ksprintf(s, "%g", z->v1.f);
1967  else if (z->type != BCF_BT_CHAR) kputw(z->v1.i, s);
1968  else kputc(z->v1.i, s);
1969  } else bcf_fmt_array(s, z->len, z->type, z->vptr);
1970  }
1971  if ( first ) kputc('.', s);
1972  } else kputc('.', s);
1973  // FORMAT and individual information
1974  if (v->n_sample)
1975  {
1976  int i,j;
1977  if ( v->n_fmt)
1978  {
1979  int gt_i = -1;
1980  bcf_fmt_t *fmt = v->d.fmt;
1981  int first = 1;
 // FORMAT column: ':'-joined tag names; remember which one is GT
1982  for (i = 0; i < (int)v->n_fmt; ++i) {
1983  if ( !fmt[i].p ) continue;
1984  kputc(!first ? ':' : '\t', s); first = 0;
1985  if ( fmt[i].id<0 )
1986  {
1987  fprintf(stderr, "[E::%s] invalid BCF, the FORMAT tag id=%d not present in the header.\n", __func__, fmt[i].id);
1988  abort();
1989  }
1990  kputs(h->id[BCF_DT_ID][fmt[i].id].key, s);
1991  if (strcmp(h->id[BCF_DT_ID][fmt[i].id].key, "GT") == 0) gt_i = i;
1992  }
1993  if ( first ) kputs("\t.", s);
 // one column per sample, fields in the same order as the FORMAT column
1994  for (j = 0; j < v->n_sample; ++j) {
1995  kputc('\t', s);
1996  first = 1;
1997  for (i = 0; i < (int)v->n_fmt; ++i) {
1998  bcf_fmt_t *f = &fmt[i];
1999  if ( !f->p ) continue;
2000  if (!first) kputc(':', s); first = 0;
2001  if (gt_i == i)
2002  bcf_format_gt(f,j,s);
2003  else
2004  bcf_fmt_array(s, f->n, f->type, f->p + j * f->size);
2005  }
2006  if ( first ) kputc('.', s);
2007  }
2008  }
2009  else
 // no FORMAT fields: n_sample+1 "." columns are written (presumably FORMAT plus one per sample)
2010  for (j=0; j<=v->n_sample; j++)
2011  kputs("\t.", s);
2012  }
2013  kputc('\n', s);
2014  return 0;
2015 }
2016 
2018 {
 // NOTE(review): the signature line is not present in this listing; the body
 // uses an htsFile* named fp and a kstring_t* named line.
 // Ensures the text ends with '\n' (appending one if needed), then writes it
 // through BGZF when fp->is_compressed==1 and through hwrite() otherwise.
 // Returns 0 when the write returned the full length, -1 otherwise.
 // NOTE(review): line->s[line->l-1] is read without checking line->l > 0.
2019  int ret;
2020  if ( line->s[line->l-1]!='\n' ) kputc('\n',line);
2021  if ( fp->is_compressed==1 )
2022  ret = bgzf_write(fp->fp.bgzf, line->s, line->l);
2023  else
2024  ret = hwrite(fp->fp.hfile, line->s, line->l);
2025  return ret==line->l ? 0 : -1;
2026 }
2027 
2028 int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
2029 {
2030  int ret;
2031  fp->line.l = 0;
2032  vcf_format1(h, v, &fp->line);
2033  if ( fp->is_compressed==1 )
2034  ret = bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l);
2035  else
2036  ret = hwrite(fp->fp.hfile, fp->line.s, fp->line.l);
2037  return ret==fp->line.l ? 0 : -1;
2038 }
2039 
2040 /************************
2041  * Data access routines *
2042  ************************/
2043 
2044 int bcf_hdr_id2int(const bcf_hdr_t *h, int which, const char *id)
2045 {
2046  khint_t k;
2047  vdict_t *d = (vdict_t*)h->dict[which];
2048  k = kh_get(vdict, d, id);
2049  return k == kh_end(d)? -1 : kh_val(d, k).id;
2050 }
2051 
2052 
2053 /********************
2054  *** BCF indexing ***
2055  ********************/
2056 
2057 hts_idx_t *bcf_index(htsFile *fp, int min_shift)
2058 {
2059  int n_lvls, i;
2060  bcf1_t *b;
2061  hts_idx_t *idx;
2062  bcf_hdr_t *h;
2063  int64_t max_len = 0, s;
2064  h = bcf_hdr_read(fp);
2065  if ( !h ) return NULL;
2066  int nids = 0;
2067  for (i = 0; i < h->n[BCF_DT_CTG]; ++i)
2068  {
2069  if ( !h->id[BCF_DT_CTG][i].val ) continue;
2070  if ( max_len < h->id[BCF_DT_CTG][i].val->info[0] ) max_len = h->id[BCF_DT_CTG][i].val->info[0];
2071  nids++;
2072  }
2073  if ( !max_len ) max_len = ((int64_t)1<<31) - 1; // In case contig line is broken.
2074  max_len += 256;
2075  for (n_lvls = 0, s = 1<<min_shift; max_len > s; ++n_lvls, s <<= 3);
2076  idx = hts_idx_init(nids, HTS_FMT_CSI, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
2077  b = bcf_init1();
2078  while (bcf_read1(fp,h, b) >= 0) {
2079  int ret;
2080  ret = hts_idx_push(idx, b->rid, b->pos, b->pos + b->rlen, bgzf_tell(fp->fp.bgzf), 1);
2081  if (ret < 0)
2082  {
2083  bcf_destroy1(b);
2084  hts_idx_destroy(idx);
2085  return NULL;
2086  }
2087  }
2088  hts_idx_finish(idx, bgzf_tell(fp->fp.bgzf));
2089  bcf_destroy1(b);
2090  bcf_hdr_destroy(h);
2091  return idx;
2092 }
2093 
2094 int bcf_index_build(const char *fn, int min_shift)
2095 {
2096  htsFile *fp;
2097  hts_idx_t *idx;
2098  if ((fp = hts_open(fn, "rb")) == 0) return -1;
2099  if ( !fp->fp.bgzf->is_compressed ) { hts_close(fp); return -1; }
2100  idx = bcf_index(fp, min_shift);
2101  hts_close(fp);
2102  if ( !idx ) return -1;
2103  hts_idx_save(idx, fn, HTS_FMT_CSI);
2104  hts_idx_destroy(idx);
2105  return 0;
2106 }
2107 
2108 /*****************
2109  *** Utilities ***
2110  *****************/
2111 
2112 void bcf_hdr_combine(bcf_hdr_t *dst, const bcf_hdr_t *src)
2113 {
2114  int i, ndst_ori = dst->nhrec, need_sync = 0;
2115  for (i=0; i<src->nhrec; i++)
2116  {
2117  if ( src->hrec[i]->type==BCF_HL_GEN && src->hrec[i]->value )
2118  {
2119  int j;
2120  for (j=0; j<ndst_ori; j++)
2121  {
2122  if ( dst->hrec[j]->type!=BCF_HL_GEN ) continue;
2123  if ( !strcmp(src->hrec[i]->key,dst->hrec[j]->key) && !strcmp(src->hrec[i]->value,dst->hrec[j]->value) ) break;
2124  }
2125  if ( j>=ndst_ori )
2126  need_sync += bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
2127  }
2128  else
2129  {
2130  bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, src->hrec[i]->vals[0]);
2131  if ( !rec )
2132  need_sync += bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
2133  }
2134  }
2135  if ( need_sync ) bcf_hdr_sync(dst);
2136 }
2137 int bcf_translate(const bcf_hdr_t *dst_hdr, bcf_hdr_t *src_hdr, bcf1_t *line)
2138 {
2139  int i;
2140  if ( line->errcode )
2141  {
2142  fprintf(stderr,"[%s:%d %s] Unchecked error (%d), exiting.\n", __FILE__,__LINE__,__FUNCTION__,line->errcode);
2143  exit(1);
2144  }
2145  if ( src_hdr->ntransl==-1 ) return 0; // no need to translate, all tags have the same id
2146  if ( !src_hdr->ntransl ) // called for the first time, see what needs translating
2147  {
2148  int dict;
2149  for (dict=0; dict<2; dict++) // BCF_DT_ID and BCF_DT_CTG
2150  {
2151  src_hdr->transl[dict] = (int*) malloc(src_hdr->n[dict]*sizeof(int));
2152  for (i=0; i<src_hdr->n[dict]; i++)
2153  {
2154  if ( i>=dst_hdr->n[dict] || strcmp(src_hdr->id[dict][i].key,dst_hdr->id[dict][i].key) )
2155  {
2156  src_hdr->transl[dict][i] = bcf_hdr_id2int(dst_hdr,dict,src_hdr->id[dict][i].key);
2157  src_hdr->ntransl++;
2158  }
2159  else
2160  src_hdr->transl[dict][i] = -1;
2161  }
2162  }
2163  if ( !src_hdr->ntransl )
2164  {
2165  free(src_hdr->transl[0]); src_hdr->transl[0] = NULL;
2166  free(src_hdr->transl[1]); src_hdr->transl[1] = NULL;
2167  src_hdr->ntransl = -1;
2168  }
2169  if ( src_hdr->ntransl==-1 ) return 0;
2170  }
2171  bcf_unpack(line,BCF_UN_ALL);
2172 
2173  // CHROM
2174  if ( src_hdr->transl[BCF_DT_CTG][line->rid] >=0 ) line->rid = src_hdr->transl[BCF_DT_CTG][line->rid];
2175 
2176  // FILTER
2177  for (i=0; i<line->d.n_flt; i++)
2178  {
2179  int src_id = line->d.flt[i];
2180  if ( src_hdr->transl[BCF_DT_ID][src_id] >=0 )
2181  line->d.flt[i] = src_hdr->transl[BCF_DT_ID][src_id];
2182  line->d.shared_dirty |= BCF1_DIRTY_FLT;
2183  }
2184 
2185  // INFO
2186  for (i=0; i<line->n_info; i++)
2187  {
2188  int src_id = line->d.info[i].key;
2189  int dst_id = src_hdr->transl[BCF_DT_ID][src_id];
2190  if ( dst_id<0 ) continue;
2191  int src_size = src_id>>7 ? ( src_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
2192  int dst_size = dst_id>>7 ? ( dst_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
2193  if ( src_size==dst_size ) // can overwrite
2194  {
2195  line->d.info[i].key = dst_id;
2196  uint8_t *vptr = line->d.info[i].vptr - line->d.info[i].vptr_off;
2197  if ( dst_size==BCF_BT_INT8 ) { vptr[1] = (uint8_t)dst_id; }
2198  else if ( dst_size==BCF_BT_INT16 ) { *(uint16_t*)vptr = (uint16_t)dst_id; }
2199  else { *(uint32_t*)vptr = (uint32_t)dst_id; }
2200  }
2201  else // must realloc
2202  {
2203  bcf_info_t *info = &line->d.info[i];
2204  assert( !info->vptr_free );
2205  kstring_t str = {0,0,0};
2206  bcf_enc_int1(&str, dst_id);
2207  bcf_enc_size(&str, info->len,info->type);
2208  info->vptr_off = str.l;
2209  kputsn((char*)info->vptr, info->vptr_len, &str);
2210  info->vptr = (uint8_t*)str.s + info->vptr_off;
2211  info->vptr_free = 1;
2212  info->key = dst_id;
2213  line->d.shared_dirty |= BCF1_DIRTY_INF;
2214  }
2215  }
2216 
2217  // FORMAT
2218  for (i=0; i<line->n_fmt; i++)
2219  {
2220  int src_id = line->d.fmt[i].id;
2221  int dst_id = src_hdr->transl[BCF_DT_ID][src_id];
2222  if ( dst_id<0 ) continue;
2223  int src_size = src_id>>7 ? ( src_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
2224  int dst_size = dst_id>>7 ? ( dst_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
2225  if ( src_size==dst_size ) // can overwrite
2226  {
2227  line->d.fmt[i].id = dst_id;
2228  uint8_t *p = line->d.fmt[i].p - line->d.fmt[i].p_off; // pointer to the vector size (4bits) and BT type (4bits)
2229  if ( dst_size==BCF_BT_INT8 ) { p[1] = dst_id; }
2230  else if ( dst_size==BCF_BT_INT16 ) { uint8_t *x = (uint8_t*) &dst_id; p[1] = x[0]; p[2] = x[1]; }
2231  else { uint8_t *x = (uint8_t*) &dst_id; p[1] = x[0]; p[2] = x[1]; p[3] = x[2]; p[4] = x[3]; }
2232  }
2233  else // must realloc
2234  {
2235  bcf_fmt_t *fmt = &line->d.fmt[i];
2236  assert( !fmt->p_free );
2237  kstring_t str = {0,0,0};
2238  bcf_enc_int1(&str, dst_id);
2239  bcf_enc_size(&str, fmt->n, fmt->type);
2240  fmt->p_off = str.l;
2241  kputsn((char*)fmt->p, fmt->p_len, &str);
2242  fmt->p = (uint8_t*)str.s + fmt->p_off;
2243  fmt->p_free = 1;
2244  fmt->id = dst_id;
2245  line->d.indiv_dirty = 1;
2246  }
2247  }
2248  return 0;
2249 }
2250 
2252 {
2253  bcf_hdr_t *hout = bcf_hdr_init("r");
2254  char *htxt = bcf_hdr_fmt_text(hdr, 1, NULL);
2255  bcf_hdr_parse(hout, htxt);
2256  free(htxt);
2257  return hout;
2258 }
2259 
2260 bcf_hdr_t *bcf_hdr_subset(const bcf_hdr_t *h0, int n, char *const* samples, int *imap)
2261 {
2262  int hlen;
2263  char *htxt = bcf_hdr_fmt_text(h0, 1, &hlen);
2264  kstring_t str;
2265  bcf_hdr_t *h;
2266  str.l = str.m = 0; str.s = 0;
2267  h = bcf_hdr_init("w");
2269  int j;
2270  for (j=0; j<n; j++) imap[j] = -1;
2271  if ( bcf_hdr_nsamples(h0) > 0) {
2272  char *p;
2273  int i = 0, end = n? 8 : 7;
2274  while ((p = strstr(htxt, "#CHROM\t")) != 0)
2275  if (p > htxt && *(p-1) == '\n') break;
2276  while ((p = strchr(p, '\t')) != 0 && i < end) ++i, ++p;
2277  if (i != end) {
2278  free(h); free(str.s);
2279  return 0; // malformated header
2280  }
2281  kputsn(htxt, p - htxt, &str);
2282  for (i = 0; i < n; ++i) {
2283  imap[i] = bcf_hdr_id2int(h0, BCF_DT_SAMPLE, samples[i]);
2284  if (imap[i] < 0) continue;
2285  kputc('\t', &str);
2286  kputs(samples[i], &str);
2287  }
2288  } else kputsn(htxt, hlen, &str);
2289  while (str.l && (!str.s[str.l-1] || str.s[str.l-1]=='\n') ) str.l--; // kill trailing zeros and newlines
2290  kputc('\n',&str);
2291  bcf_hdr_parse(h, str.s);
2292  free(str.s);
2293  free(htxt);
2294  return h;
2295 }
2296 
2297 int bcf_hdr_set_samples(bcf_hdr_t *hdr, const char *samples, int is_file)
2298 {
2299  if ( samples && !strcmp("-",samples) ) return 0; // keep all samples
2300 
2301  hdr->nsamples_ori = bcf_hdr_nsamples(hdr);
2302  if ( !samples ) { bcf_hdr_nsamples(hdr) = 0; return 0; } // exclude all samples
2303 
2304  int i, narr = bit_array_size(bcf_hdr_nsamples(hdr));
2305  hdr->keep_samples = (uint8_t*) calloc(narr,1);
2306  if ( samples[0]=='^' )
2307  for (i=0; i<bcf_hdr_nsamples(hdr); i++) bit_array_set(hdr->keep_samples,i);
2308 
2309  int idx, n, ret = 0;
2310  char **smpls = hts_readlist(samples[0]=='^'?samples+1:samples, is_file, &n);
2311  if ( !smpls ) return -1;
2312  for (i=0; i<n; i++)
2313  {
2314  idx = bcf_hdr_id2int(hdr,BCF_DT_SAMPLE,smpls[i]);
2315  if ( idx<0 )
2316  {
2317  if ( !ret ) ret = i+1;
2318  continue;
2319  }
2320  assert( idx<bcf_hdr_nsamples(hdr) );
2321  if ( samples[0]=='^' )
2322  bit_array_clear(hdr->keep_samples, idx);
2323  else
2324  bit_array_set(hdr->keep_samples, idx);
2325  }
2326  for (i=0; i<n; i++) free(smpls[i]);
2327  free(smpls);
2328 
2329  bcf_hdr_nsamples(hdr) = 0;
2330  for (i=0; i<hdr->nsamples_ori; i++)
2331  if ( bit_array_test(hdr->keep_samples,i) ) bcf_hdr_nsamples(hdr)++;
2332  if ( !bcf_hdr_nsamples(hdr) ) { free(hdr->keep_samples); hdr->keep_samples=NULL; }
2333  else
2334  {
2335  char **samples = (char**) malloc(sizeof(char*)*bcf_hdr_nsamples(hdr));
2336  idx = 0;
2337  for (i=0; i<hdr->nsamples_ori; i++)
2338  if ( bit_array_test(hdr->keep_samples,i) ) samples[idx++] = strdup(hdr->samples[i]);
2339  free(hdr->samples);
2340  hdr->samples = samples;
2341 
2342  // delete original samples from the dictionary
2343  vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_SAMPLE];
2344  int k;
2345  for (k = kh_begin(d); k != kh_end(d); ++k)
2346  if (kh_exist(d, k)) free((char*)kh_key(d, k));
2347  kh_destroy(vdict, d);
2348 
2349  // add the subset back
2350  hdr->dict[BCF_DT_SAMPLE] = d = kh_init(vdict);
2351  for (i=0; i<bcf_hdr_nsamples(hdr); i++)
2352  {
2353  int ignore, k = kh_put(vdict, d, hdr->samples[i], &ignore);
2354  kh_val(d, k) = bcf_idinfo_def;
2355  kh_val(d, k).id = kh_size(d) - 1;
2356  }
2357  bcf_hdr_sync(hdr);
2358  }
2359 
2360  return ret;
2361 }
2362 
2363 int bcf_subset(const bcf_hdr_t *h, bcf1_t *v, int n, int *imap)
2364 {
2365  kstring_t ind;
2366  ind.s = 0; ind.l = ind.m = 0;
2367  if (n) {
2368  bcf_fmt_t *fmt;
2369  int i, j;
2370  fmt = (bcf_fmt_t*)alloca(v->n_fmt * sizeof(bcf_fmt_t));
2371  uint8_t *ptr = (uint8_t*)v->indiv.s;
2372  for (i = 0; i < v->n_fmt; ++i)
2373  ptr = bcf_unpack_fmt_core1(ptr, v->n_sample, &fmt[i]);
2374  for (i = 0; i < (int)v->n_fmt; ++i) {
2375  bcf_fmt_t *f = &fmt[i];
2376  bcf_enc_int1(&ind, f->id);
2377  bcf_enc_size(&ind, f->n, f->type);
2378  for (j = 0; j < n; ++j)
2379  if (imap[j] >= 0) kputsn((char*)(f->p + imap[j] * f->size), f->size, &ind);
2380  }
2381  for (i = j = 0; j < n; ++j) if (imap[j] >= 0) ++i;
2382  v->n_sample = i;
2383  } else v->n_sample = 0;
2384  if ( !v->n_sample ) v->n_fmt = 0;
2385  free(v->indiv.s);
2386  v->indiv = ind;
2387  v->unpacked &= ~BCF_UN_FMT; // only BCF is ready for output, VCF will need to unpack again
2388  return 0;
2389 }
2390 
2392 {
2393  int i;
2394  bcf_unpack(v, BCF_UN_STR);
2395  for (i = 0; i < v->n_allele; ++i)
2396  if (strlen(v->d.allele[i]) != 1) break;
2397  return i == v->n_allele;
2398 }
2399 
2400 static void bcf_set_variant_type(const char *ref, const char *alt, variant_t *var)
2401 {
2402  // The most frequent case
2403  if ( !ref[1] && !alt[1] )
2404  {
2405  if ( *alt == '.' || *ref==*alt ) { var->n = 0; var->type = VCF_REF; return; }
2406  if ( *alt == 'X' ) { var->n = 0; var->type = VCF_REF; return; } // mpileup's X allele shouldn't be treated as variant
2407  var->n = 1; var->type = VCF_SNP; return;
2408  }
2409 
2410  const char *r = ref, *a = alt;
2411  while (*r && *a && *r==*a ) { r++; a++; }
2412 
2413  if ( *a && !*r )
2414  {
2415  while ( *a ) a++;
2416  var->n = (a-alt)-(r-ref); var->type = VCF_INDEL; return;
2417  }
2418  else if ( *r && !*a )
2419  {
2420  while ( *r ) r++;
2421  var->n = (a-alt)-(r-ref); var->type = VCF_INDEL; return;
2422  }
2423  else if ( !*r && !*a )
2424  {
2425  var->n = 0; var->type = VCF_REF; return;
2426  }
2427 
2428  const char *re = r, *ae = a;
2429  while ( re[1] ) re++;
2430  while ( ae[1] ) ae++;
2431  while ( *re==*ae && re>r && ae>a ) { re--; ae--; }
2432  if ( ae==a )
2433  {
2434  if ( re==r ) { var->n = 1; var->type = VCF_SNP; return; }
2435  var->n = -(re-r);
2436  if ( *re==*ae ) { var->type = VCF_INDEL; return; }
2437  var->type = VCF_OTHER; return;
2438  }
2439  else if ( re==r )
2440  {
2441  var->n = ae-a;
2442  if ( *re==*ae ) { var->type = VCF_INDEL; return; }
2443  var->type = VCF_OTHER; return;
2444  }
2445 
2446  var->type = ( re-r == ae-a ) ? VCF_MNP : VCF_OTHER;
2447  var->n = ( re-r > ae-a ) ? -(re-r+1) : ae-a+1;
2448 
2449  // should do also complex events, SVs, etc...
2450 }
2451 
2452 static void bcf_set_variant_types(bcf1_t *b)
2453 {
2454  if ( !(b->unpacked & BCF_UN_STR) ) bcf_unpack(b, BCF_UN_STR);
2455  bcf_dec_t *d = &b->d;
2456  if ( d->n_var < b->n_allele )
2457  {
2458  d->var = (variant_t *) realloc(d->var, sizeof(variant_t)*b->n_allele);
2459  d->n_var = b->n_allele;
2460  }
2461  int i;
2462  b->d.var_type = 0;
2463  for (i=1; i<b->n_allele; i++)
2464  {
2465  bcf_set_variant_type(d->allele[0],d->allele[i], &d->var[i]);
2466  b->d.var_type |= d->var[i].type;
2467  //fprintf(stderr,"[set_variant_type] %d %s %s -> %d %d .. %d\n", b->pos+1,d->allele[0],d->allele[i],d->var[i].type,d->var[i].n, b->d.var_type);
2468  }
2469 }
2470 
2472 {
2473  if ( rec->d.var_type==-1 ) bcf_set_variant_types(rec);
2474  return rec->d.var_type;
2475 }
2476 int bcf_get_variant_type(bcf1_t *rec, int ith_allele)
2477 {
2478  if ( rec->d.var_type==-1 ) bcf_set_variant_types(rec);
2479  return rec->d.var[ith_allele].type;
2480 }
2481 
2482 int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type)
2483 {
2484  // Is the field already present?
2485  int i, inf_id = bcf_hdr_id2int(hdr,BCF_DT_ID,key);
2486  if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,inf_id) ) return -1; // No such INFO field in the header
2487  if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO);
2488 
2489  for (i=0; i<line->n_info; i++)
2490  if ( inf_id==line->d.info[i].key ) break;
2491  bcf_info_t *inf = i==line->n_info ? NULL : &line->d.info[i];
2492 
2493  if ( !n || (type==BCF_HT_STR && !values) )
2494  {
2495  if ( inf )
2496  {
2497  // Mark the tag for removal, free existing memory if necessary
2498  if ( inf->vptr_free )
2499  {
2500  free(inf->vptr - inf->vptr_off);
2501  inf->vptr_free = 0;
2502  }
2503  line->d.shared_dirty |= BCF1_DIRTY_INF;
2504  inf->vptr = NULL;
2505  }
2506  return 0;
2507  }
2508 
2509  // Encode the values and determine the size required to accommodate the values
2510  kstring_t str = {0,0,0};
2511  bcf_enc_int1(&str, inf_id);
2512  if ( type==BCF_HT_INT )
2513  bcf_enc_vint(&str, n, (int32_t*)values, -1);
2514  else if ( type==BCF_HT_REAL )
2515  bcf_enc_vfloat(&str, n, (float*)values);
2516  else if ( type==BCF_HT_FLAG || type==BCF_HT_STR )
2517  {
2518  if ( values==NULL )
2519  bcf_enc_size(&str, 0, BCF_BT_NULL);
2520  else
2521  bcf_enc_vchar(&str, strlen((char*)values), (char*)values);
2522  }
2523  else
2524  {
2525  fprintf(stderr, "[E::%s] the type %d not implemented yet\n", __func__, type);
2526  abort();
2527  }
2528 
2529  // Is the INFO tag already present
2530  if ( inf )
2531  {
2532  // Is it big enough to accommodate new block?
2533  if ( str.l <= inf->vptr_len + inf->vptr_off )
2534  {
2535  if ( str.l != inf->vptr_len + inf->vptr_off ) line->d.shared_dirty |= BCF1_DIRTY_INF;
2536  uint8_t *ptr = inf->vptr - inf->vptr_off;
2537  memcpy(ptr, str.s, str.l);
2538  free(str.s);
2539  int vptr_free = inf->vptr_free;
2540  bcf_unpack_info_core1(ptr, inf);
2541  inf->vptr_free = vptr_free;
2542  }
2543  else
2544  {
2545  assert( !inf->vptr_free ); // fix the caller or improve here: this has been modified before
2546  bcf_unpack_info_core1((uint8_t*)str.s, inf);
2547  inf->vptr_free = 1;
2548  line->d.shared_dirty |= BCF1_DIRTY_INF;
2549  }
2550  }
2551  else
2552  {
2553  // The tag is not present, create new one
2554  line->n_info++;
2555  hts_expand0(bcf_info_t, line->n_info, line->d.m_info , line->d.info);
2556  inf = &line->d.info[line->n_info-1];
2557  bcf_unpack_info_core1((uint8_t*)str.s, inf);
2558  inf->vptr_free = 1;
2559  line->d.shared_dirty |= BCF1_DIRTY_INF;
2560  }
2561  line->unpacked |= BCF_UN_INFO;
2562  return 0;
2563 }
2564 
2565 int bcf_update_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char **values, int n)
2566 {
2567  if ( !n )
2568  return bcf_update_format(hdr,line,key,NULL,0,BCF_HT_STR);
2569 
2570  int i, max_len = 0;
2571  for (i=0; i<n; i++)
2572  {
2573  int len = strlen(values[i]);
2574  if ( len > max_len ) max_len = len;
2575  }
2576  char *out = (char*) malloc(max_len*n);
2577  if ( !out ) return -2;
2578  for (i=0; i<n; i++)
2579  {
2580  char *dst = out+i*max_len;
2581  const char *src = values[i];
2582  int j = 0;
2583  while ( src[j] ) { dst[j] = src[j]; j++; }
2584  for (; j<max_len; j++) dst[j] = 0;
2585  }
2586  int ret = bcf_update_format(hdr,line,key,out,max_len*n,BCF_HT_STR);
2587  free(out);
2588  return ret;
2589 }
2590 
2591 int bcf_update_format(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type)
2592 {
2593  // Is the field already present?
2594  int i, fmt_id = bcf_hdr_id2int(hdr,BCF_DT_ID,key);
2595  if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,fmt_id) )
2596  {
2597  if ( !n ) return 0;
2598  return -1; // the key not present in the header
2599  }
2600 
2601  if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
2602 
2603  for (i=0; i<line->n_fmt; i++)
2604  if ( line->d.fmt[i].id==fmt_id ) break;
2605  bcf_fmt_t *fmt = i==line->n_fmt ? NULL : &line->d.fmt[i];
2606 
2607  if ( !n )
2608  {
2609  if ( fmt )
2610  {
2611  // Mark the tag for removal, free existing memory if necessary
2612  if ( fmt->p_free )
2613  {
2614  free(fmt->p - fmt->p_off);
2615  fmt->p_free = 0;
2616  }
2617  line->d.indiv_dirty = 1;
2618  fmt->p = NULL;
2619  }
2620  return 0;
2621  }
2622 
2623  line->n_sample = bcf_hdr_nsamples(hdr);
2624  int nps = n / line->n_sample; // number of values per sample
2625  assert( nps && nps*line->n_sample==n ); // must be divisible by n_sample
2626 
2627  // Encode the values and determine the size required to accommodate the values
2628  kstring_t str = {0,0,0};
2629  bcf_enc_int1(&str, fmt_id);
2630  if ( type==BCF_HT_INT )
2631  bcf_enc_vint(&str, n, (int32_t*)values, nps);
2632  else if ( type==BCF_HT_REAL )
2633  {
2634  bcf_enc_size(&str, nps, BCF_BT_FLOAT);
2635  kputsn((char*)values, nps*line->n_sample*sizeof(float), &str);
2636  }
2637  else if ( type==BCF_HT_STR )
2638  {
2639  bcf_enc_size(&str, nps, BCF_BT_CHAR);
2640  kputsn((char*)values, nps*line->n_sample, &str);
2641  }
2642  else
2643  {
2644  fprintf(stderr, "[E::%s] the type %d not implemented yet\n", __func__, type);
2645  abort();
2646  }
2647 
2648  if ( !fmt )
2649  {
2650  // Not present, new format field
2651  line->n_fmt++;
2652  hts_expand0(bcf_fmt_t, line->n_fmt, line->d.m_fmt, line->d.fmt);
2653 
2654  // Special case: VCF specification requires that GT is always first
2655  if ( line->n_fmt > 1 && key[0]=='G' && key[1]=='T' && !key[2] )
2656  {
2657  for (i=line->n_fmt-1; i>0; i--)
2658  line->d.fmt[i] = line->d.fmt[i-1];
2659  fmt = &line->d.fmt[0];
2660  }
2661  else
2662  fmt = &line->d.fmt[line->n_fmt-1];
2663  bcf_unpack_fmt_core1((uint8_t*)str.s, line->n_sample, fmt);
2664  line->d.indiv_dirty = 1;
2665  fmt->p_free = 1;
2666  }
2667  else
2668  {
2669  // The tag is already present, check if it is big enough to accomodate the new block
2670  if ( str.l <= fmt->p_len + fmt->p_off )
2671  {
2672  // good, the block is big enough
2673  if ( str.l != fmt->p_len + fmt->p_off ) line->d.indiv_dirty = 1;
2674  uint8_t *ptr = fmt->p - fmt->p_off;
2675  memcpy(ptr, str.s, str.l);
2676  free(str.s);
2677  int p_free = fmt->p_free;
2678  bcf_unpack_fmt_core1(ptr, line->n_sample, fmt);
2679  fmt->p_free = p_free;
2680  }
2681  else
2682  {
2683  assert( !fmt->p_free ); // fix the caller or improve here: this has been modified before
2684  bcf_unpack_fmt_core1((uint8_t*)str.s, line->n_sample, fmt);
2685  fmt->p_free = 1;
2686  line->d.indiv_dirty = 1;
2687  }
2688  }
2689  line->unpacked |= BCF_UN_FMT;
2690  return 0;
2691 }
2692 
2693 
2694 int bcf_update_filter(const bcf_hdr_t *hdr, bcf1_t *line, int *flt_ids, int n)
2695 {
2696  if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
2697  line->d.shared_dirty |= BCF1_DIRTY_FLT;
2698  line->d.n_flt = n;
2699  if ( !n ) return 0;
2700  hts_expand(int, line->d.n_flt, line->d.m_flt, line->d.flt);
2701  int i;
2702  for (i=0; i<n; i++)
2703  line->d.flt[i] = flt_ids[i];
2704  return 0;
2705 }
2706 
2707 int bcf_add_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id)
2708 {
2709  if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
2710  int i;
2711  for (i=0; i<line->d.n_flt; i++)
2712  if ( flt_id==line->d.flt[i] ) break;
2713  if ( i<line->d.n_flt ) return 0; // this filter is already set
2714  line->d.shared_dirty |= BCF1_DIRTY_FLT;
2715  if ( flt_id==0 ) // set to PASS
2716  line->d.n_flt = 1;
2717  else if ( line->d.n_flt==1 && line->d.flt[0]==0 )
2718  line->d.n_flt = 1;
2719  else
2720  line->d.n_flt++;
2721  hts_expand(int, line->d.n_flt, line->d.m_flt, line->d.flt);
2722  line->d.flt[line->d.n_flt-1] = flt_id;
2723  return 1;
2724 }
2725 int bcf_remove_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id, int pass)
2726 {
2727  if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
2728  int i;
2729  for (i=0; i<line->d.n_flt; i++)
2730  if ( flt_id==line->d.flt[i] ) break;
2731  if ( i==line->d.n_flt ) return 0; // the filter is not present
2732  line->d.shared_dirty |= BCF1_DIRTY_FLT;
2733  if ( i!=line->d.n_flt-1 ) memmove(line->d.flt+i,line->d.flt+i+1,line->d.n_flt-i);
2734  line->d.n_flt--;
2735  if ( !line->d.n_flt && pass ) bcf_add_filter(hdr,line,0);
2736  return 0;
2737 }
2738 
2739 int bcf_has_filter(const bcf_hdr_t *hdr, bcf1_t *line, char *filter)
2740 {
2741  if ( filter[0]=='.' && !filter[1] ) filter = "PASS";
2742  int id = bcf_hdr_id2int(hdr, BCF_DT_ID, filter);
2743  if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FLT,id) ) return -1; // not defined in the header
2744 
2745  if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
2746  if ( id==0 && !line->d.n_flt) return 1; // PASS
2747 
2748  int i;
2749  for (i=0; i<line->d.n_flt; i++)
2750  if ( line->d.flt[i]==id ) return 1;
2751  return 0;
2752 }
2753 
2754 static inline int _bcf1_sync_alleles(const bcf_hdr_t *hdr, bcf1_t *line, int nals)
2755 {
2756  line->d.shared_dirty |= BCF1_DIRTY_ALS;
2757 
2758  line->n_allele = nals;
2759  hts_expand(char*, line->n_allele, line->d.m_allele, line->d.allele);
2760 
2761  char *als = line->d.als;
2762  int n = 0;
2763  while (n<nals)
2764  {
2765  line->d.allele[n] = als;
2766  while ( *als ) als++;
2767  als++;
2768  n++;
2769  }
2770  return 0;
2771 }
2772 int bcf_update_alleles(const bcf_hdr_t *hdr, bcf1_t *line, const char **alleles, int nals)
2773 {
2774  kstring_t tmp = {0,0,0};
2775  char *free_old = NULL;
2776 
2777  // If the supplied alleles are not pointers to line->d.als, the existing block can be reused.
2778  int i;
2779  for (i=0; i<nals; i++)
2780  if ( alleles[i]>=line->d.als && alleles[i]<line->d.als+line->d.m_als ) break;
2781  if ( i==nals )
2782  {
2783  // all alleles point elsewhere, reuse the existing block
2784  tmp.l = 0; tmp.s = line->d.als; tmp.m = line->d.m_als;
2785  }
2786  else
2787  free_old = line->d.als;
2788 
2789  for (i=0; i<nals; i++)
2790  {
2791  kputs(alleles[i], &tmp);
2792  kputc(0, &tmp);
2793  }
2794  line->d.als = tmp.s; line->d.m_als = tmp.m;
2795  free(free_old);
2796  return _bcf1_sync_alleles(hdr,line,nals);
2797 }
2798 
2799 int bcf_update_alleles_str(const bcf_hdr_t *hdr, bcf1_t *line, const char *alleles_string)
2800 {
2801  kstring_t tmp;
2802  tmp.l = 0; tmp.s = line->d.als; tmp.m = line->d.m_als;
2803  kputs(alleles_string, &tmp);
2804  line->d.als = tmp.s; line->d.m_als = tmp.m;
2805 
2806  int nals = 1;
2807  char *t = line->d.als;
2808  while (*t)
2809  {
2810  if ( *t==',' ) { *t = 0; nals++; }
2811  t++;
2812  }
2813  return _bcf1_sync_alleles(hdr, line, nals);
2814 }
2815 
2816 int bcf_update_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id)
2817 {
2818  kstring_t tmp;
2819  tmp.l = 0; tmp.s = line->d.id; tmp.m = line->d.m_id;
2820  if ( id )
2821  kputs(id, &tmp);
2822  else
2823  kputs(".", &tmp);
2824  line->d.id = tmp.s; line->d.m_id = tmp.m;
2825  line->d.shared_dirty |= BCF1_DIRTY_ID;
2826  return 0;
2827 }
2828 
2829 bcf_fmt_t *bcf_get_fmt(const bcf_hdr_t *hdr, bcf1_t *line, const char *key)
2830 {
2831  int i, id = bcf_hdr_id2int(hdr, BCF_DT_ID, key);
2832  if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,id) ) return NULL; // no such FMT field in the header
2833  if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
2834  for (i=0; i<line->n_fmt; i++)
2835  {
2836  if ( line->d.fmt[i].id==id ) return &line->d.fmt[i];
2837  }
2838  return NULL;
2839 }
2840 
2841 bcf_info_t *bcf_get_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key)
2842 {
2843  int i, id = bcf_hdr_id2int(hdr, BCF_DT_ID, key);
2844  if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,id) ) return NULL; // no such INFO field in the header
2845  if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO);
2846  for (i=0; i<line->n_info; i++)
2847  {
2848  if ( line->d.info[i].key==id ) return &line->d.info[i];
2849  }
2850  return NULL;
2851 }
2852 
/**
 * Copy the values of one INFO tag of a record into a caller-supplied buffer.
 *
 * @param hdr   header defining the tag
 * @param line  record to read; its INFO is unpacked when necessary
 * @param tag   INFO tag name
 * @param dst   in/out: buffer receiving the values; grown with realloc() when
 *              *ndst is too small
 * @param ndst  in/out: capacity of *dst (bytes for BCF_HT_STR, elements otherwise)
 * @param type  expected header type of the tag (BCF_HT_FLAG, BCF_HT_STR,
 *              BCF_HT_INT, otherwise values are sized as float)
 * @return -1 when the tag is not an INFO field of the header, -2 when the
 *         header type differs from type, -3 when a non-flag tag is absent
 *         from the record; for flags 1 when present and 0 when absent; for
 *         strings the string length; otherwise the number of values copied
 *         (copying stops at the first vector-end marker).
 *
 * NOTE(review): the results of realloc() are not checked in this function.
 */
2853 int bcf_get_info_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type)
2854 {
2855  int i,j, tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag);
2856  if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,tag_id) ) return -1; // no such INFO field in the header
2857  if ( bcf_hdr_id2type(hdr,BCF_HL_INFO,tag_id)!=type ) return -2; // expected different type
2858 
2859  if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO);
2860 
2861  for (i=0; i<line->n_info; i++)
2862  if ( line->d.info[i].key==tag_id ) break;
2863  if ( i==line->n_info ) return ( type==BCF_HT_FLAG ) ? 0 : -3; // the tag is not present in this record
2864  if ( type==BCF_HT_FLAG ) return 1;
2865 
2866  bcf_info_t *info = &line->d.info[i];
 /* Strings: copy info->len bytes and append a terminating zero */
2867  if ( type==BCF_HT_STR )
2868  {
2869  if ( *ndst < info->len+1 )
2870  {
2871  *ndst = info->len + 1;
2872  *dst = realloc(*dst, *ndst);
2873  }
2874  memcpy(*dst,info->vptr,info->len);
2875  ((uint8_t*)*dst)[info->len] = 0;
2876  return info->len;
2877  }
2878 
2879  // Make sure the buffer is big enough
2880  int size1 = type==BCF_HT_INT ? sizeof(int32_t) : sizeof(float);
2881  if ( *ndst < info->len )
2882  {
2883  *ndst = info->len;
2884  *dst = realloc(*dst, *ndst * size1);
2885  }
2886 
 /* A single value is taken from info->v1 rather than from info->vptr */
2887  if ( info->len == 1 )
2888  {
2889  if ( info->type==BCF_BT_FLOAT ) *((float*)*dst) = info->v1.f;
2890  else *((int32_t*)*dst) = info->v1.i;
2891  return 1;
2892  }
2893 
 /* BRANCH copies up to info->len values of type_t from info->vptr into *dst
    as out_type_t: it returns the count copied so far at the first vector-end
    value and stores the missing marker for missing values. */
2894 #define BRANCH(type_t, is_missing, is_vector_end, set_missing, out_type_t) { \
2895  out_type_t *tmp = (out_type_t *) *dst; \
2896  type_t *p = (type_t *) info->vptr; \
2897  for (j=0; j<info->len; j++) \
2898  { \
2899  if ( is_vector_end ) return j; \
2900  if ( is_missing ) set_missing; \
2901  else *tmp = p[j]; \
2902  tmp++; \
2903  } \
2904  return j; \
2905 }
 /* NOTE(review): the listing jumps from line 2906 to 2910 here, so switch
    cases may be missing from this view -- confirm against the original file.
    As shown, every type other than BCF_BT_FLOAT prints a TODO and exits. */
2906 switch (info->type) {
2910  case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(p[j]), bcf_float_is_vector_end(p[j]), bcf_float_set_missing(*tmp), float); break;
2911  default: fprintf(stderr,"TODO: %s:%d .. info->type=%d\n", __FILE__,__LINE__, info->type); exit(1);
2912 }
2913 #undef BRANCH
2914 return -4; // this can never happen
2915 }
2916 
2917 int bcf_get_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char ***dst, int *ndst)
2918 {
2919  int i,tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag);
2920  if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,tag_id) ) return -1; // no such FORMAT field in the header
2921  if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=BCF_HT_STR ) return -2; // expected different type
2922 
2923  if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
2924 
2925  for (i=0; i<line->n_fmt; i++)
2926  if ( line->d.fmt[i].id==tag_id ) break;
2927  if ( i==line->n_fmt ) return -3; // the tag is not present in this record
2928  bcf_fmt_t *fmt = &line->d.fmt[i];
2929 
2930  int nsmpl = bcf_hdr_nsamples(hdr);
2931  if ( !*dst )
2932  {
2933  *dst = (char**) malloc(sizeof(char*)*nsmpl);
2934  if ( !*dst ) return -4; // could not alloc
2935  (*dst)[0] = NULL;
2936  }
2937  int n = (fmt->n+1)*nsmpl;
2938  if ( *ndst < n )
2939  {
2940  (*dst)[0] = realloc((*dst)[0], n);
2941  if ( !(*dst)[0] ) return -4; // could not alloc
2942  *ndst = n;
2943  }
2944  for (i=0; i<nsmpl; i++)
2945  {
2946  uint8_t *src = fmt->p + i*fmt->n;
2947  uint8_t *tmp = (uint8_t*)(*dst)[0] + i*(fmt->n+1);
2948  memcpy(tmp,src,fmt->n);
2949  tmp[fmt->n] = 0;
2950  (*dst)[i] = (char*) tmp;
2951  }
2952  return n;
2953 }
2954 
/**
 * Copy the per-sample values of one FORMAT tag of a record into a
 * caller-supplied buffer.
 *
 * @param hdr   header defining the tag and the number of samples
 * @param line  record to read; its FORMAT is unpacked when necessary
 * @param tag   FORMAT tag name
 * @param dst   in/out: buffer receiving the values; grown with realloc() when
 *              *ndst is too small
 * @param ndst  in/out: capacity of *dst (bytes for BCF_HT_STR, elements otherwise)
 * @param type  expected header type of the tag; for "GT" the header type must
 *              be BCF_HT_STR whatever type is passed
 * @return -1 when the tag is not a FORMAT field of the header, -2 on a type
 *         mismatch, -3 when the tag is absent from the record, -4 on
 *         allocation failure; otherwise the number of bytes (strings) or
 *         values (fmt->n per sample) stored in *dst
 */
2955 int bcf_get_format_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type)
2956 {
2957  int i,j, tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag);
2958  if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,tag_id) ) return -1; // no such FORMAT field in the header
2959  if ( tag[0]=='G' && tag[1]=='T' && tag[2]==0 )
2960  {
2961  // Ugly: GT field is considered to be a string by the VCF header but BCF represents it as INT.
2962  if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=BCF_HT_STR ) return -2;
2963  }
2964  else if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=type ) return -2; // expected different type
2965 
2966  if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
2967 
2968  for (i=0; i<line->n_fmt; i++)
2969  if ( line->d.fmt[i].id==tag_id ) break;
2970  if ( i==line->n_fmt ) return -3; // the tag is not present in this record
2971  bcf_fmt_t *fmt = &line->d.fmt[i];
2972 
 /* Strings: the fmt->n bytes of every sample are copied as one raw block,
    without adding terminators */
2973  if ( type==BCF_HT_STR )
2974  {
2975  int n = fmt->n*bcf_hdr_nsamples(hdr);
2976  if ( *ndst < n )
2977  {
2978  *dst = realloc(*dst, n);
2979  if ( !*dst ) return -4; // could not alloc
2980  *ndst = n;
2981  }
2982  memcpy(*dst,fmt->p,n);
2983  return n;
2984  }
2985 
2986  // Make sure the buffer is big enough
2987  int nsmpl = bcf_hdr_nsamples(hdr);
2988  int size1 = type==BCF_HT_INT ? sizeof(int32_t) : sizeof(float);
2989  if ( *ndst < fmt->n*nsmpl )
2990  {
2991  *ndst = fmt->n*nsmpl;
2992  *dst = realloc(*dst, *ndst*size1);
 /* NOTE(review): the next line tests dst (the out-parameter itself), not the
    realloc() result *dst as the string branch above does, so an allocation
    failure looks undetected here -- confirm and change to !*dst. */
2993  if ( !dst ) return -4; // could not alloc
2994  }
2995 
 /* BRANCH converts, sample by sample, fmt->n values of type_t into
    out_type_t: missing values get the missing marker, and from the first
    vector-end value onwards the rest of the sample's slots are filled with
    the vector-end marker. The source pointer advances by fmt->size per sample. */
2996 #define BRANCH(type_t, is_missing, is_vector_end, set_missing, set_vector_end, out_type_t) { \
2997  out_type_t *tmp = (out_type_t *) *dst; \
2998  type_t *p = (type_t*) fmt->p; \
2999  for (i=0; i<nsmpl; i++) \
3000  { \
3001  for (j=0; j<fmt->n; j++) \
3002  { \
3003  if ( is_missing ) set_missing; \
3004  else if ( is_vector_end ) { set_vector_end; break; } \
3005  else *tmp = p[j]; \
3006  tmp++; \
3007  } \
3008  for (; j<fmt->n; j++) { set_vector_end; tmp++; } \
3009  p = (type_t *)((char *)p + fmt->size); \
3010  } \
3011 }
 /* NOTE(review): the listing jumps from line 3012 to 3016 here, so switch
    cases may be missing from this view -- confirm against the original file.
    As shown, every type other than BCF_BT_FLOAT prints a TODO and exits. */
3012 switch (fmt->type) {
3016  case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(p[j]), bcf_float_is_vector_end(p[j]), bcf_float_set_missing(*tmp), bcf_float_set_vector_end(*tmp), float); break;
3017  default: fprintf(stderr,"TODO: %s:%d .. fmt->type=%d\n", __FILE__,__LINE__, fmt->type); exit(1);
3018 }
3019 #undef BRANCH
3020 return nsmpl*fmt->n;
3021 }
3022