NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
vcf.h
Go to the documentation of this file.
1 /*
2  todo:
3  - make the function names consistent
4  - provide calls to abstract away structs as much as possible
5  */
6 
7 #ifndef BCF_H
8 #define BCF_H
9 
10 #include <stdint.h>
11 #include <limits.h>
12 #include <assert.h>
13 #include "hts.h"
14 #include "kstring.h"
15 
16 
17 /*****************
18  * Header struct *
19  *****************/
20 
21 #define BCF_HL_FLT 0 // header line
22 #define BCF_HL_INFO 1
23 #define BCF_HL_FMT 2
24 #define BCF_HL_CTG 3
25 #define BCF_HL_STR 4 // structured header line TAG=<A=..,B=..>
26 #define BCF_HL_GEN 5 // generic header line
27 
28 #define BCF_HT_FLAG 0 // header type
29 #define BCF_HT_INT 1
30 #define BCF_HT_REAL 2
31 #define BCF_HT_STR 3
32 
33 #define BCF_VL_FIXED 0 // variable length
34 #define BCF_VL_VAR 1
35 #define BCF_VL_A 2
36 #define BCF_VL_G 3
37 #define BCF_VL_R 4
38 
39 /* === Dictionary ===
40 
41  The header keeps three dictonaries. The first keeps IDs in the
42  "FILTER/INFO/FORMAT" lines, the second keeps the sequence names and lengths
43  in the "contig" lines and the last keeps the sample names. bcf_hdr_t::dict[]
44  is the actual hash table, which is opaque to the end users. In the hash
45  table, the key is the ID or sample name as a C string and the value is a
46  bcf_idinfo_t struct. bcf_hdr_t::id[] points to key-value pairs in the hash
47  table in the order that they appear in the VCF header. bcf_hdr_t::n[] is the
48  size of the hash table or, equivalently, the length of the id[] arrays.
49 */
50 
51 #define BCF_DT_ID 0 // dictionary type
52 #define BCF_DT_CTG 1
53 #define BCF_DT_SAMPLE 2
54 
55 // Complete textual representation of a header line
56 typedef struct {
57  int type; // One of the BCF_HL_* type
58  char *key; // The part before '=', i.e. FILTER/INFO/FORMAT/contig/fileformat etc.
59  char *value; // Set only for generic lines, NULL for FILTER/INFO, etc.
60  int nkeys; // Number of structured fields
61  char **keys, **vals; // The key=value pairs
62 } bcf_hrec_t;
63 
64 typedef struct {
65  uint32_t info[3]; // stores Number:20, var:4, Type:4, ColType:4 for BCF_HL_FLT,INFO,FMT
66  bcf_hrec_t *hrec[3];
67  int id;
68 } bcf_idinfo_t;
69 
70 typedef struct {
71  const char *key;
72  const bcf_idinfo_t *val;
73 } bcf_idpair_t;
74 
75 typedef struct {
76  int32_t n[3];
77  bcf_idpair_t *id[3];
78  void *dict[3]; // ID dictionary, contig dict and sample dict
79  char **samples;
81  int nhrec;
82  int ntransl, *transl[2]; // for bcf_translate()
83  int nsamples_ori; // for bcf_hdr_set_samples()
86 } bcf_hdr_t;
87 
88 extern uint8_t bcf_type_shift[];
89 
90 /**************
91  * VCF record *
92  **************/
93 
94 #define BCF_BT_NULL 0
95 #define BCF_BT_INT8 1
96 #define BCF_BT_INT16 2
97 #define BCF_BT_INT32 3
98 #define BCF_BT_FLOAT 5
99 #define BCF_BT_CHAR 7
100 
101 #define VCF_REF 0
102 #define VCF_SNP 1
103 #define VCF_MNP 2
104 #define VCF_INDEL 4
105 #define VCF_OTHER 8
106 
107 typedef struct {
108  int type, n; // variant type and the number of bases affected, negative for deletions
109 } variant_t;
110 
111 typedef struct {
112  int id; // id: numeric tag id, the corresponding string is bcf_hdr_t::id[BCF_DT_ID][$id].key
113  int n, size, type; // n: number of values per-sample; size: number of bytes per-sample; type: one of BCF_BT_* types
114  uint8_t *p; // same as vptr and vptr_* in bcf_info_t below
116  uint32_t p_off:31, p_free:1;
117 } bcf_fmt_t;
118 
119 typedef struct {
120  int key; // key: numeric tag id, the corresponding string is bcf_hdr_t::id[BCF_DT_ID][$key].key
121  int type, len; // type: one of BCF_BT_* types; len: vector length, 1 for scalars
122  union {
123  int32_t i; // integer value
124  float f; // float value
125  } v1; // only set if $len==1; for easier access
126  uint8_t *vptr; // pointer to data array in bcf1_t->shared.s, excluding the size+type and tag id bytes
127  uint32_t vptr_len; // length of the vptr block or, when set, of the vptr_mod block, excluding offset
128  uint32_t vptr_off:31, // vptr offset, i.e., the size of the INFO key plus size+type bytes
129  vptr_free:1; // indicates that vptr-vptr_off must be freed; set only when modified and the new
130  // data block is bigger than the original
131 } bcf_info_t;
132 
133 
134 #define BCF1_DIRTY_ID 1
135 #define BCF1_DIRTY_ALS 2
136 #define BCF1_DIRTY_FLT 4
137 #define BCF1_DIRTY_INF 8
138 
139 typedef struct {
140  int m_fmt, m_info, m_id, m_als, m_allele, m_flt; // allocated size (high-water mark); do not change
141  int n_flt; // Number of FILTER fields
142  int *flt; // FILTER keys in the dictionary
143  char *id, *als; // ID and REF+ALT block (\0-seperated)
144  char **allele; // allele[0] is the REF (allele[] pointers to the als block); all null terminated
145  bcf_info_t *info; // INFO
146  bcf_fmt_t *fmt; // FORMAT and individual sample
147  variant_t *var; // $var and $var_type set only when set_variant_types called
148  int n_var, var_type;
149  int shared_dirty; // if set, shared.s must be recreated on BCF output
150  int indiv_dirty; // if set, indiv.s must be recreated on BCF output
151 } bcf_dec_t;
152 
153 
154 #define BCF_ERR_CTG_UNDEF 1
155 #define BCF_ERR_TAG_UNDEF 2
156 #define BCF_ERR_NCOLS 4
157 
158 /*
159  The bcf1_t structure corresponds to one VCF/BCF line. Reading from VCF file
160  is slower because the string is first to be parsed, packed into BCF line
161  (done in vcf_parse), then unpacked into internal bcf1_t structure. If it
162  is known in advance that some of the fields will not be required (notably
163  the sample columns), parsing of these can be skipped by setting max_unpack
164  appropriately.
165  Similarly, it is fast to output a BCF line because the columns (kept in
166  shared.s, indiv.s, etc.) are written directly by bcf_write, whereas a VCF
167  line must be formatted in vcf_format.
168  */
169 typedef struct {
170  int32_t rid; // CHROM
171  int32_t pos; // POS
172  int32_t rlen; // length of REF
173  float qual; // QUAL
174  uint32_t n_info:16, n_allele:16;
175  uint32_t n_fmt:8, n_sample:24;
177  bcf_dec_t d; // lazy evaluation: $d is not generated by bcf_read(), but by explicitly calling bcf_unpack()
178  int max_unpack; // Set to BCF_UN_STR, BCF_UN_FLT, or BCF_UN_INFO to boost performance of vcf_parse when some of the fields won't be needed
179  int unpacked; // remember what has been unpacked to allow calling bcf_unpack() repeatedly without redoing the work
180  int unpack_size[3]; // the original block size of ID, REF+ALT and FILTER
181  int errcode; // one of BCF_ERR_* codes
182 } bcf1_t;
183 
184 /*******
185  * API *
186  *******/
187 
188 #ifdef __cplusplus
189 extern "C" {
190 #endif
191 
192  /***********************************************************************
193  * BCF and VCF I/O
194  *
195  * A note about naming conventions: htslib internally represents VCF
196  * records as bcf1_t data structures, therefore most functions are
197  * prefixed with bcf_. There are a few exceptions where the functions must
198  * be aware of both BCF and VCF worlds, such as bcf_parse vs vcf_parse. In
199  * these cases, functions prefixed with bcf_ are more general and work
200  * with both BCF and VCF.
201  *
202  ***********************************************************************/
203 
205  #define bcf_init1() bcf_init()
206  #define bcf_read1(fp,h,v) bcf_read((fp),(h),(v))
207  #define vcf_read1(fp,h,v) vcf_read((fp),(h),(v))
208  #define bcf_write1(fp,h,v) bcf_write((fp),(h),(v))
209  #define vcf_write1(fp,h,v) vcf_write((fp),(h),(v))
210  #define bcf_destroy1(v) bcf_destroy(v)
211  #define vcf_parse1(s,h,v) vcf_parse((s),(h),(v))
212  #define bcf_clear1(v) bcf_clear(v)
213  #define vcf_format1(h,v,s) vcf_format((h),(v),(s))
214 
222  bcf_hdr_t *bcf_hdr_init(const char *mode);
223 
225  void bcf_hdr_destroy(bcf_hdr_t *h);
226 
228  bcf1_t *bcf_init(void);
229 
231  void bcf_destroy(bcf1_t *v);
232 
237  void bcf_empty(bcf1_t *v);
238 
244  void bcf_clear(bcf1_t *v);
245 
246 
248  typedef htsFile vcfFile;
249  #define bcf_open(fn, mode) hts_open((fn), (mode))
250  #define vcf_open(fn, mode) hts_open((fn), (mode))
251  #define bcf_close(fp) hts_close(fp)
252  #define vcf_close(fp) hts_close(fp)
253 
256 
279  int bcf_hdr_set_samples(bcf_hdr_t *hdr, const char *samples, int is_file);
280  int bcf_subset_format(const bcf_hdr_t *hdr, bcf1_t *rec);
281 
282 
284  int bcf_hdr_write(htsFile *fp, const bcf_hdr_t *h);
285 
287  int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v);
288 
290  int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s);
291 
300  int bcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v);
301 
309  #define BCF_UN_STR 1 // up to ALT inclusive
310  #define BCF_UN_FLT 2 // up to FILTER
311  #define BCF_UN_INFO 4 // up to INFO
312  #define BCF_UN_SHR (BCF_UN_STR|BCF_UN_FLT|BCF_UN_INFO) // all shared information
313  #define BCF_UN_FMT 8 // unpack format and each sample
314  #define BCF_UN_IND BCF_UN_FMT // a synonymo of BCF_UN_FMT
315  #define BCF_UN_ALL (BCF_UN_SHR|BCF_UN_FMT) // everything
316  int bcf_unpack(bcf1_t *b, int which);
317 
318  /*
319  * bcf_dup() - create a copy of BCF record.
320  *
321  * Note that bcf_unpack() must be called on the returned copy as if it was
322  * obtained from bcf_read(). Also note that bcf_dup() calls bcf_sync1(src)
323  * internally to reflect any changes made by bcf_update_* functions.
324  */
325  bcf1_t *bcf_dup(bcf1_t *src);
326 
330  int bcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v);
331 
338  int vcf_hdr_write(htsFile *fp, const bcf_hdr_t *h);
339  int vcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v);
340  int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v);
341 
343  int bcf_readrec(BGZF *fp, void *null, void *v, int *tid, int *beg, int *end);
344 
345 
346 
347  /**************************************************************************
348  * Header querying and manipulation routines
349  **************************************************************************/
350 
352  bcf_hdr_t *bcf_hdr_dup(const bcf_hdr_t *hdr);
354  void bcf_hdr_combine(bcf_hdr_t *dst, const bcf_hdr_t *src);
355 
361  int bcf_hdr_add_sample(bcf_hdr_t *hdr, const char *sample);
362 
364  int bcf_hdr_set(bcf_hdr_t *hdr, const char *fname);
365 
370  char *bcf_hdr_fmt_text(const bcf_hdr_t *hdr, int is_bcf, int *len);
371 
373  int bcf_hdr_append(bcf_hdr_t *h, const char *line);
374  int bcf_hdr_printf(bcf_hdr_t *h, const char *format, ...);
375 
376  const char *bcf_hdr_get_version(const bcf_hdr_t *hdr);
377  void bcf_hdr_set_version(bcf_hdr_t *hdr, const char *version);
378 
384  void bcf_hdr_remove(bcf_hdr_t *h, int type, const char *key);
385 
397  bcf_hdr_t *bcf_hdr_subset(const bcf_hdr_t *h0, int n, char *const* samples, int *imap);
398 
400  const char **bcf_hdr_seqnames(const bcf_hdr_t *h, int *nseqs);
401 
403  #define bcf_hdr_nsamples(hdr) (hdr)->n[BCF_DT_SAMPLE]
404 
405 
407  bcf_hrec_t *bcf_hdr_parse_line(const bcf_hdr_t *h, const char *line, int *len);
408  void bcf_hrec_format(const bcf_hrec_t *hrec, kstring_t *str);
409  int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec);
410  bcf_hrec_t *bcf_hdr_get_hrec(const bcf_hdr_t *hdr, int type, const char *id); // type is one of BCF_HL_FLT,..,BCF_HL_CTG
412  void bcf_hrec_add_key(bcf_hrec_t *hrec, const char *str, int len);
413  void bcf_hrec_set_val(bcf_hrec_t *hrec, int i, const char *str, int len, int is_quoted);
414  int bcf_hrec_find_key(bcf_hrec_t *hrec, const char *key);
415  void bcf_hrec_destroy(bcf_hrec_t *hrec);
416 
417 
418 
419  /**************************************************************************
420  * Individual record querying and manipulation routines
421  **************************************************************************/
422 
424  int bcf_subset(const bcf_hdr_t *h, bcf1_t *v, int n, int *imap);
425 
433  int bcf_translate(const bcf_hdr_t *dst_hdr, bcf_hdr_t *src_hdr, bcf1_t *src_line);
434 
438  int bcf_get_variant_types(bcf1_t *rec);
439  int bcf_get_variant_type(bcf1_t *rec, int ith_allele);
440  int bcf_is_snp(bcf1_t *v);
441 
447  int bcf_update_filter(const bcf_hdr_t *hdr, bcf1_t *line, int *flt_ids, int n);
454  int bcf_add_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id);
460  int bcf_remove_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id, int pass);
464  int bcf_has_filter(const bcf_hdr_t *hdr, bcf1_t *line, char *filter);
476  int bcf_update_alleles(const bcf_hdr_t *hdr, bcf1_t *line, const char **alleles, int nals);
477  int bcf_update_alleles_str(const bcf_hdr_t *hdr, bcf1_t *line, const char *alleles_string);
478  int bcf_update_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id);
479 
480  /*
481  * bcf_update_info_*() - functions for updating INFO fields
482  * @hdr: the BCF header
483  * @line: VCF line to be edited
484  * @key: the INFO tag to be updated
485  * @values: pointer to the array of values. Pass NULL to remove the tag.
486  * @n: number of values in the array. When set to 0, the INFO tag is removed
487  *
488  * The @string in bcf_update_info_flag() is optional, @n indicates whether
489  * the flag is set or removed.
490  *
491  * Returns 0 on success or negative value on error.
492  */
493  #define bcf_update_info_int32(hdr,line,key,values,n) bcf_update_info((hdr),(line),(key),(values),(n),BCF_HT_INT)
494  #define bcf_update_info_float(hdr,line,key,values,n) bcf_update_info((hdr),(line),(key),(values),(n),BCF_HT_REAL)
495  #define bcf_update_info_flag(hdr,line,key,string,n) bcf_update_info((hdr),(line),(key),(string),(n),BCF_HT_FLAG)
496  #define bcf_update_info_string(hdr,line,key,string) bcf_update_info((hdr),(line),(key),(string),1,BCF_HT_STR)
497  int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type);
498 
499  /*
500  * bcf_update_format_*() - functions for updating FORMAT fields
501  * @values: pointer to the array of values, the same number of elements
502  * is expected for each sample. Missing values must be padded
503  * with bcf_*_missing or bcf_*_vector_end values.
504  * @n: number of values in the array. If n==0, existing tag is removed.
505  *
506  * The function bcf_update_format_string() is a higher-level (slower) variant of
507  * bcf_update_format_char(). The former accepts array of \0-terminated strings
508  * whereas the latter requires that the strings are collapsed into a single array
509  * of fixed-length strings. In case of strings with variable length, shorter strings
510  * can be \0-padded. Note that the collapsed strings passed to bcf_update_format_char()
511  * are not \0-terminated.
512  *
513  * Returns 0 on success or negative value on error.
514  */
515  #define bcf_update_format_int32(hdr,line,key,values,n) bcf_update_format((hdr),(line),(key),(values),(n),BCF_HT_INT)
516  #define bcf_update_format_float(hdr,line,key,values,n) bcf_update_format((hdr),(line),(key),(values),(n),BCF_HT_REAL)
517  #define bcf_update_format_char(hdr,line,key,values,n) bcf_update_format((hdr),(line),(key),(values),(n),BCF_HT_STR)
518  #define bcf_update_genotypes(hdr,line,gts,n) bcf_update_format((hdr),(line),"GT",(gts),(n),BCF_HT_INT) // See bcf_gt_ macros below
519  int bcf_update_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char **values, int n);
520  int bcf_update_format(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type);
521 
522  // Macros for setting genotypes correctly, for use with bcf_update_genotypes only; idx corresponds
523  // to VCF's GT (1-based index to ALT or 0 for the reference allele) and val is the opposite, obtained
524  // from bcf_get_genotypes() below.
525  #define bcf_gt_phased(idx) ((idx+1)<<1|1)
526  #define bcf_gt_unphased(idx) ((idx+1)<<1)
527  #define bcf_gt_missing 0
528  #define bcf_gt_is_phased(idx) ((idx)&1)
529  #define bcf_gt_allele(val) (((val)>>1)-1)
530 
532  #define bcf_alleles2gt(a,b) ((a)>(b)?((a)*((a)+1)/2+(b)):((b)*((b)+1)/2+(a)))
533  static inline void bcf_gt2alleles(int igt, int *a, int *b)
534  {
535  int k = 0, dk = 1;
536  while ( k<igt ) { dk++; k += dk; }
537  *b = dk - 1; *a = igt - k + *b;
538  }
539 
549  bcf_fmt_t *bcf_get_fmt(const bcf_hdr_t *hdr, bcf1_t *line, const char *key);
550  bcf_info_t *bcf_get_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key);
551 
570  #define bcf_get_info_int32(hdr,line,tag,dst,ndst) bcf_get_info_values(hdr,line,tag,(void**)(dst),ndst,BCF_HT_INT)
571  #define bcf_get_info_float(hdr,line,tag,dst,ndst) bcf_get_info_values(hdr,line,tag,(void**)(dst),ndst,BCF_HT_REAL)
572  #define bcf_get_info_string(hdr,line,tag,dst,ndst) bcf_get_info_values(hdr,line,tag,(void**)(dst),ndst,BCF_HT_STR)
573  #define bcf_get_info_flag(hdr,line,tag,dst,ndst) bcf_get_info_values(hdr,line,tag,(void**)(dst),ndst,BCF_HT_FLAG)
574  int bcf_get_info_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type);
575 
597  #define bcf_get_format_int32(hdr,line,tag,dst,ndst) bcf_get_format_values(hdr,line,tag,(void**)(dst),ndst,BCF_HT_INT)
598  #define bcf_get_format_float(hdr,line,tag,dst,ndst) bcf_get_format_values(hdr,line,tag,(void**)(dst),ndst,BCF_HT_REAL)
599  #define bcf_get_format_char(hdr,line,tag,dst,ndst) bcf_get_format_values(hdr,line,tag,(void**)(dst),ndst,BCF_HT_STR)
600  #define bcf_get_genotypes(hdr,line,dst,ndst) bcf_get_format_values(hdr,line,"GT",(void**)(dst),ndst,BCF_HT_INT)
601  int bcf_get_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char ***dst, int *ndst);
602  int bcf_get_format_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type);
603 
604 
605 
606  /**************************************************************************
607  * Helper functions
608  **************************************************************************/
609 
619  int bcf_hdr_id2int(const bcf_hdr_t *hdr, int type, const char *id);
620  #define bcf_hdr_int2id(hdr,type,int_id) ((hdr)->id[type][int_id].key)
621 
626  static inline int bcf_hdr_name2id(const bcf_hdr_t *hdr, const char *id) { return bcf_hdr_id2int(hdr, BCF_DT_CTG, id); }
627  static inline const char *bcf_hdr_id2name(const bcf_hdr_t *hdr, int rid) { return hdr->id[BCF_DT_CTG][rid].key; }
628  static inline const char *bcf_seqname(const bcf_hdr_t *hdr, bcf1_t *rec) { return hdr->id[BCF_DT_CTG][rec->rid].key; }
629 
644  #define bcf_hdr_id2length(hdr,type,int_id) ((hdr)->id[BCF_DT_ID][int_id].val->info[type]>>8 & 0xf)
645  #define bcf_hdr_id2number(hdr,type,int_id) ((hdr)->id[BCF_DT_ID][int_id].val->info[type]>>12)
646  #define bcf_hdr_id2type(hdr,type,int_id) ((hdr)->id[BCF_DT_ID][int_id].val->info[type]>>4 & 0xf)
647  #define bcf_hdr_id2coltype(hdr,type,int_id) ((hdr)->id[BCF_DT_ID][int_id].val->info[type] & 0xf)
648  #define bcf_hdr_idinfo_exists(hdr,type,int_id) ((int_id<0 || bcf_hdr_id2coltype(hdr,type,int_id)==0xf) ? 0 : 1)
649  #define bcf_hdr_id2hrec(hdr,dict_type,col_type,int_id) ((hdr)->id[(dict_type)==BCF_DT_CTG?BCF_DT_CTG:BCF_DT_ID][int_id].val->hrec[(dict_type)==BCF_DT_CTG?0:(col_type)])
650 
651  void bcf_fmt_array(kstring_t *s, int n, int type, void *data);
653 
654  void bcf_enc_vchar(kstring_t *s, int l, const char *a);
655  void bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize);
656  void bcf_enc_vfloat(kstring_t *s, int n, float *a);
657 
658 
659  /**************************************************************************
660  * BCF index
661  *
662  * Note that these functions work with BCFs only. See synced_bcf_reader.h
663  * which provides (amongst other things) an API to work transparently with
664  * both indexed BCFs and VCFs.
665  **************************************************************************/
666 
667  #define bcf_itr_destroy(iter) hts_itr_destroy(iter)
668  #define bcf_itr_queryi(idx, tid, beg, end) hts_itr_query((idx), (tid), (beg), (end), bcf_readrec)
669  #define bcf_itr_querys(idx, hdr, s) hts_itr_querys((idx), (s), (hts_name2id_f)(bcf_hdr_name2id), (hdr), hts_itr_query, bcf_readrec)
670  #define bcf_itr_next(htsfp, itr, r) hts_itr_next((htsfp)->fp.bgzf, (itr), (r), 0)
671  #define bcf_index_load(fn) hts_idx_load(fn, HTS_FMT_CSI)
672  #define bcf_index_seqnames(idx, hdr, nptr) hts_idx_seqnames((idx),(nptr),(hts_id2name_f)(bcf_hdr_id2name),(hdr))
673 
674  int bcf_index_build(const char *fn, int min_shift);
675 
676 #ifdef __cplusplus
677 }
678 #endif
679 
680 /*******************
681  * Typed value I/O *
682  *******************/
683 
684 /*
685  Note that in contrast with BCFv2.1 specification, HTSlib implementation
686  allows missing values in vectors. For integer types, the values 0x80,
687  0x8000, 0x80000000 are interpreted as missing values and 0x81, 0x8001,
688  0x80000001 as end-of-vector indicators. Similarly for floats, the value of
689  0x7F800001 is interpreted as a missing value and 0x7F800002 as an
690  end-of-vector indicator.
691  Note that the end-of-vector byte is not part of the vector.
692 
693  This trial BCF version (v2.2) is compatible with the VCF specification and
694  enables to handle correctly vectors with different ploidy in presence of
695  missing values.
696  */
697 #define bcf_int8_vector_end (INT8_MIN+1)
698 #define bcf_int16_vector_end (INT16_MIN+1)
699 #define bcf_int32_vector_end (INT32_MIN+1)
700 #define bcf_str_vector_end 0
701 #define bcf_int8_missing INT8_MIN
702 #define bcf_int16_missing INT16_MIN
703 #define bcf_int32_missing INT32_MIN
704 #define bcf_str_missing 0x07
707 static inline void bcf_float_set(float *ptr, uint32_t value)
708 {
709  union { uint32_t i; float f; } u;
710  u.i = value;
711  *ptr = u.f;
712 }
713 #define bcf_float_set_vector_end(x) bcf_float_set(&(x),bcf_float_vector_end)
714 #define bcf_float_set_missing(x) bcf_float_set(&(x),bcf_float_missing)
715 static inline int bcf_float_is_missing(float f)
716 {
717  union { uint32_t i; float f; } u;
718  u.f = f;
719  return u.i==bcf_float_missing ? 1 : 0;
720 }
721 static inline int bcf_float_is_vector_end(float f)
722 {
723  union { uint32_t i; float f; } u;
724  u.f = f;
725  return u.i==bcf_float_vector_end ? 1 : 0;
726 }
727 
728 static inline void bcf_format_gt(bcf_fmt_t *fmt, int isample, kstring_t *str)
729 {
730  #define BRANCH(type_t, missing, vector_end) { \
731  type_t *ptr = (type_t*) (fmt->p + isample*fmt->size); \
732  int i; \
733  for (i=0; i<fmt->n && ptr[i]!=vector_end; i++) \
734  { \
735  if ( i ) kputc("/|"[ptr[i]&1], str); \
736  if ( !(ptr[i]>>1) ) kputc('.', str); \
737  else kputw((ptr[i]>>1) - 1, str); \
738  } \
739  if (i == 0) kputc('.', str); \
740  }
741  switch (fmt->type) {
745  default: fprintf(stderr,"FIXME: type %d in bcf_format_gt?\n", fmt->type); abort(); break;
746  }
747  #undef BRANCH
748 }
749 
750 static inline void bcf_enc_size(kstring_t *s, int size, int type)
751 {
752  if (size >= 15) {
753  kputc(15<<4|type, s);
754  if (size >= 128) {
755  if (size >= 32768) {
756  int32_t x = size;
757  kputc(1<<4|BCF_BT_INT32, s);
758  kputsn((char*)&x, 4, s);
759  } else {
760  int16_t x = size;
761  kputc(1<<4|BCF_BT_INT16, s);
762  kputsn((char*)&x, 2, s);
763  }
764  } else {
765  kputc(1<<4|BCF_BT_INT8, s);
766  kputc(size, s);
767  }
768  } else kputc(size<<4|type, s);
769 }
770 
771 static inline int bcf_enc_inttype(long x)
772 {
773  if (x <= INT8_MAX && x > bcf_int8_missing) return BCF_BT_INT8;
774  if (x <= INT16_MAX && x > bcf_int16_missing) return BCF_BT_INT16;
775  return BCF_BT_INT32;
776 }
777 
778 static inline void bcf_enc_int1(kstring_t *s, int32_t x)
779 {
780  if (x == bcf_int32_vector_end) {
781  bcf_enc_size(s, 1, BCF_BT_INT8);
782  kputc(bcf_int8_vector_end, s);
783  } else if (x == bcf_int32_missing) {
784  bcf_enc_size(s, 1, BCF_BT_INT8);
785  kputc(bcf_int8_missing, s);
786  } else if (x <= INT8_MAX && x > bcf_int8_missing) {
787  bcf_enc_size(s, 1, BCF_BT_INT8);
788  kputc(x, s);
789  } else if (x <= INT16_MAX && x > bcf_int16_missing) {
790  int16_t z = x;
791  bcf_enc_size(s, 1, BCF_BT_INT16);
792  kputsn((char*)&z, 2, s);
793  } else {
794  int32_t z = x;
795  bcf_enc_size(s, 1, BCF_BT_INT32);
796  kputsn((char*)&z, 4, s);
797  }
798 }
799 
800 static inline int32_t bcf_dec_int1(const uint8_t *p, int type, uint8_t **q)
801 {
802  if (type == BCF_BT_INT8) {
803  *q = (uint8_t*)p + 1;
804  return *(int8_t*)p;
805  } else if (type == BCF_BT_INT16) {
806  *q = (uint8_t*)p + 2;
807  return *(int16_t*)p;
808  } else {
809  *q = (uint8_t*)p + 4;
810  return *(int32_t*)p;
811  }
812 }
813 
814 static inline int32_t bcf_dec_typed_int1(const uint8_t *p, uint8_t **q)
815 {
816  return bcf_dec_int1(p + 1, *p&0xf, q);
817 }
818 
819 static inline int32_t bcf_dec_size(const uint8_t *p, uint8_t **q, int *type)
820 {
821  *type = *p & 0xf;
822  if (*p>>4 != 15) {
823  *q = (uint8_t*)p + 1;
824  return *p>>4;
825  } else return bcf_dec_typed_int1(p + 1, q);
826 }
827 
828 #endif