NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
synced_bcf_reader.h
Go to the documentation of this file.
1 /*
2  The synced_bcf_reader allows to keep multiple VCFs open and stream them
3  using the next_line iterator in a seamless matter without worrying about
4  chromosomes and synchronizing the sites. This is used by vcfcheck to
5  compare multiple VCFs simultaneously and is used also for merging,
6  creating intersections, etc.
7 
8  The synced_bcf_reader also provides API for reading indexed BCF/VCF,
9  hiding differences in BCF/VCF opening, indexing and reading.
10 
11 
12  Example of usage:
13 
14  bcf_srs_t *sr = bcf_sr_init();
15  for (i=0; i<nfiles; i++)
16  bcf_sr_add_reader(sr,files[i]);
17  while ( bcf_sr_next_line(sr) )
18  {
19  for (i=0; i<nfiles; i++)
20  {
21  bcf1_t *line = bcf_sr_get_line(sr,i);
22  ...
23  }
24  }
25  bcf_sr_destroy(sr);
26 */
27 
28 #ifndef SYNCED_BCF_READER_H
29 #define SYNCED_BCF_READER_H
30 
31 #include "hts.h"
32 #include "vcf.h"
33 #include "tbx.h"
34 
35 // How should be treated sites with the same position but different alleles
36 #define COLLAPSE_NONE 0 // require the exact same set of alleles in all files
37 #define COLLAPSE_SNPS 1 // allow different alleles, as long as they all are SNPs
38 #define COLLAPSE_INDELS 2 // the same as above, but with indels
39 #define COLLAPSE_ANY 4 // any combination of alleles can be returned by bcf_sr_next_line()
40 #define COLLAPSE_SOME 8 // at least some of the ALTs must match
41 #define COLLAPSE_BOTH (COLLAPSE_SNPS|COLLAPSE_INDELS)
42 
43 typedef struct _bcf_sr_regions_t
44 {
45  // for reading from tabix-indexed file (big data)
46  tbx_t *tbx; // tabix index
47  hts_itr_t *itr; // tabix iterator
48  kstring_t line; // holder of the current line, set only when reading from tabix-indexed files
50  char *fname;
51  int is_bin; // is open in binary mode (tabix access)
52  char **als; // parsed alleles if targets_als set and _regions_match_alleles called
53  kstring_t als_str; // block of parsed alleles
54  int nals, mals; // number of set alleles and the size of allocated array
55  int als_type; // alleles type, currently VCF_SNP or VCF_INDEL
56 
57  // user handler to deal with skipped regions without a counterpart in VCFs
58  void (*missed_reg_handler)(struct _bcf_sr_regions_t *, void *);
60 
61  // for in-memory regions (small data)
62  struct _region_t *regs; // the regions
63 
64  // shared by both tabix-index and in-memory regions
65  void *seq_hash; // keys: sequence names, values: index to seqs
66  char **seq_names; // sequence names
67  int nseqs; // number of sequences (chromosomes) in the file
68  int iseq; // current position: chr name, index to snames
69  int start, end; // current position: start, end of the region (0-based)
71 }
73 
74 typedef struct
75 {
81  const char *fname;
82  bcf1_t **buffer; // cached VCF records. First is the current record synced across the reader
83  int nbuffer, mbuffer; // number of cached records (including the current record); number of allocated records
84  int nfilter_ids, *filter_ids; // -1 for ".", otherwise filter id as returned by bcf_id2int
85  int type;
86  int *samples, n_smpl; // list of columns in the order consistent with bcf_srs_t.samples
87 }
88 bcf_sr_t;
89 
90 typedef struct
91 {
92  // Parameters controlling the logic
93  int collapse; // How should the duplicate sites be treated. One of the COLLAPSE_* types above.
94  char *apply_filters; // If set, sites where none of the FILTER strings is listed
95  // will be skipped. Active only at the time of
96  // initialization, that is during the add_reader()
97  // calls. Therefore, each reader can be initialized with different
98  // filters.
99  int require_index; // Some tools do not need random access
100  int max_unpack; // When reading VCFs and knowing some fields will not be needed, boost performance of vcf_parse1
101  int *has_line; // Corresponds to return value of bcf_sr_next_line but is not limited by sizeof(int). Use bcf_sr_has_line macro to query.
102 
103  // Auxiliary data
105  int nreaders;
106  int streaming; // reading mode: index-jumping or streaming
107  int explicit_regs; // was the list of regions se by bcf_sr_set_regions or guessed from tabix index?
108  char **samples; // List of samples
109  bcf_sr_regions_t *regions, *targets; // see bcf_sr_set_[targets|regions] for description
110  int targets_als; // subset to targets not only by position but also by alleles? (todo)
112  int n_smpl;
113 }
114 bcf_srs_t;
115 
117 bcf_srs_t *bcf_sr_init(void);
118 
120 void bcf_sr_destroy(bcf_srs_t *readers);
121 
132 int bcf_sr_add_reader(bcf_srs_t *readers, const char *fname);
133 void bcf_sr_remove_reader(bcf_srs_t *files, int i);
134 
135 
144 int bcf_sr_next_line(bcf_srs_t *readers);
145 #define bcf_sr_has_line(readers, i) (readers)->has_line[i]
146 #define bcf_sr_get_line(_readers, i) ((_readers)->has_line[i] ? ((_readers)->readers[i].buffer[0]) : NULL)
147 #define bcf_sr_region_done(_readers,i) (!(_readers)->has_line[i] && !(_readers)->readers[i].nbuffer ? 1 : 0)
148 
154 int bcf_sr_seek(bcf_srs_t *readers, const char *seq, int pos);
155 
167 int bcf_sr_set_samples(bcf_srs_t *readers, const char *samples, int is_file);
168 
193 int bcf_sr_set_targets(bcf_srs_t *readers, const char *targets, int is_file, int alleles);
194 int bcf_sr_set_regions(bcf_srs_t *readers, const char *regions, int is_file);
195 
196 
197 
198 /*
199  * bcf_sr_regions_init()
200  * @regions: regions can be either a comma-separated list of regions
201  * (chr|chr:pos|chr:from-to|chr:from-) or VCF, BED, or
202  * tab-delimited file (the default). Uncompressed files
203  * are stored in memory while bgzip-compressed and tabix-indexed
204  * region files are streamed.
205  * @is_file: 0: regions is a comma-separated list of regions
206  * (chr|chr:pos|chr:from-to|chr:from-)
207  * 1: VCF, BED or tab-delimited file
208  * @chr, from, to:
209  * Column indexes of chromosome, start position and end position
210  * in the tab-delimited file. The positions are 1-based and
211  * inclusive.
212  * These parameters are ignored when reading from VCF, BED or
213  * tabix-indexed files. When end position column is not present,
214  * supply 'from' in place of 'to'. When 'to' is negative, first
215  * abs(to) will be attempted and if that fails, 'from' will be used
216  * instead.
217  */
218 bcf_sr_regions_t *bcf_sr_regions_init(const char *regions, int is_file, int chr, int from, int to);
220 
221 /*
222  * bcf_sr_regions_seek() - seek to the chromosome block
223  *
224  * Returns 0 on success or -1 on failure. Sets reg->seq appropriately and
225  * reg->start,reg->end to -1.
226  */
227 int bcf_sr_regions_seek(bcf_sr_regions_t *regions, const char *chr);
228 
229 /*
230  * bcf_sr_regions_next() - retrieves next region. Returns 0 on success and -1
231  * when all regions have been read. The fields reg->seq, reg->start and
232  * reg->end are filled with the genomic coordinates on succes or with
233  * NULL,-1,-1 when no region is available. The coordinates are 0-based,
234  * inclusive.
235  */
237 
238 /*
239  * bcf_sr_regions_overlap() - checks if the interval <start,end> overlaps any of
240  * the regions, the coordinates are 0-based, inclusive. The coordinate queries
241  * must come in ascending order.
242  *
243  * Returns 0 if the position is in regions; -1 if the position is not in the
244  * regions and more regions exist; -2 if not in the regions and there are no more
245  * regions left.
246  */
247 int bcf_sr_regions_overlap(bcf_sr_regions_t *reg, const char *seq, int start, int end);
248 
249 /*
250  * bcf_sr_regions_flush() - calls repeatedly regs->missed_reg_handler() until
251  * all remaining records are processed.
252  */
254 
255 #endif