NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
synced_bcf_reader.c
Go to the documentation of this file.
1 #include <stdio.h>
2 #include <unistd.h>
3 #include <string.h>
4 #include <limits.h>
5 #include <errno.h>
6 #include <ctype.h>
7 #include <sys/stat.h>
9 #include "htslib/kseq.h"
10 #include "htslib/khash_str2int.h"
11 
12 #define MAX_CSI_COOR 0x7fffffff // maximum indexable coordinate of .csi
13 
14 typedef struct
15 {
17 }
18 region1_t;
19 
20 typedef struct _region_t
21 {
23  int nregs, mregs, creg;
24 }
25 region_t;
26 
27 static void _regions_add(bcf_sr_regions_t *reg, const char *chr, int start, int end);
28 static bcf_sr_regions_t *_regions_init_string(const char *str);
29 static int _regions_match_alleles(bcf_sr_regions_t *reg, int als_idx, bcf1_t *rec);
30 
31 static int *init_filters(bcf_hdr_t *hdr, const char *filters, int *nfilters)
32 {
33  kstring_t str = {0,0,0};
34  const char *tmp = filters, *prev = filters;
35  int nout = 0, *out = NULL;
36  while ( 1 )
37  {
38  if ( *tmp==',' || !*tmp )
39  {
40  out = (int*) realloc(out, sizeof(int));
41  if ( tmp-prev==1 && *prev=='.' )
42  out[nout] = -1;
43  else
44  {
45  str.l = 0;
46  kputsn(prev, tmp-prev, &str);
47  out[nout] = bcf_hdr_id2int(hdr, BCF_DT_ID, str.s);
48  }
49  nout++;
50  if ( !*tmp ) break;
51  prev = tmp+1;
52  }
53  tmp++;
54  }
55  if ( str.m ) free(str.s);
56  *nfilters = nout;
57  return out;
58 }
59 
60 int bcf_sr_set_regions(bcf_srs_t *readers, const char *regions, int is_file)
61 {
62  assert( !readers->regions );
63  if ( readers->nreaders )
64  {
65  fprintf(stderr,"[%s:%d %s] Error: bcf_sr_set_regions() must be called before bcf_sr_add_reader()\n", __FILE__,__LINE__,__FUNCTION__);
66  return -1;
67  }
68  readers->regions = bcf_sr_regions_init(regions,is_file,0,1,-2);
69  if ( !readers->regions ) return -1;
70  readers->explicit_regs = 1;
71  readers->require_index = 1;
72  return 0;
73 }
74 int bcf_sr_set_targets(bcf_srs_t *readers, const char *targets, int is_file, int alleles)
75 {
76  assert( !readers->targets );
77  readers->targets = bcf_sr_regions_init(targets,is_file,0,1,-2);
78  if ( !readers->targets ) return -1;
79  readers->targets_als = alleles;
80  return 0;
81 }
82 
83 int bcf_sr_add_reader(bcf_srs_t *files, const char *fname)
84 {
85  files->has_line = (int*) realloc(files->has_line, sizeof(int)*(files->nreaders+1));
86  files->has_line[files->nreaders] = 0;
87  files->readers = (bcf_sr_t*) realloc(files->readers, sizeof(bcf_sr_t)*(files->nreaders+1));
88  bcf_sr_t *reader = &files->readers[files->nreaders++];
89  memset(reader,0,sizeof(bcf_sr_t));
90 
91  reader->file = hts_open(fname, "r");
92  if ( !reader->file ) return 0;
93 
94  reader->type = reader->file->is_bin? FT_BCF : FT_VCF;
95  if (reader->file->is_compressed) reader->type |= FT_GZ;
96 
97  if ( files->require_index )
98  {
99  if ( reader->type==FT_VCF_GZ )
100  {
101  reader->tbx_idx = tbx_index_load(fname);
102  if ( !reader->tbx_idx )
103  {
104  fprintf(stderr,"[add_reader] Could not load the index of %s\n", fname);
105  return 0;
106  }
107 
108  reader->header = bcf_hdr_read(reader->file);
109  }
110  else if ( reader->type==FT_BCF_GZ )
111  {
112  reader->header = bcf_hdr_read(reader->file);
113 
114  reader->bcf_idx = bcf_index_load(fname);
115  if ( !reader->bcf_idx )
116  {
117  fprintf(stderr,"[add_reader] Could not load the index of %s\n", fname);
118  return 0; // not indexed..?
119  }
120  }
121  else
122  {
123  fprintf(stderr,"Index required, expected .vcf.gz or .bcf file: %s\n", fname);
124  return 0;
125  }
126  }
127  else
128  {
129  if ( reader->type & FT_BCF )
130  {
131  reader->header = bcf_hdr_read(reader->file);
132  }
133  else if ( reader->type & FT_VCF )
134  {
135  reader->header = bcf_hdr_read(reader->file);
136  }
137  else
138  {
139  fprintf(stderr,"File type not recognised: %s\n", fname);
140  return 0;
141  }
142  files->streaming = 1;
143  }
144  if ( files->streaming && files->nreaders>1 )
145  {
146  fprintf(stderr,"[%s:%d %s] Error: %d readers, yet require_index not set\n", __FILE__,__LINE__,__FUNCTION__,files->nreaders);
147  return 0;
148  }
149  if ( files->streaming && files->regions )
150  {
151  fprintf(stderr,"[%s:%d %s] Error: cannot tabix-jump in streaming mode\n", __FILE__,__LINE__,__FUNCTION__);
152  return 0;
153  }
154  if ( !reader->header ) return 0;
155 
156  reader->fname = fname;
157  if ( files->apply_filters )
158  reader->filter_ids = init_filters(reader->header, files->apply_filters, &reader->nfilter_ids);
159 
160  // Update list of chromosomes
161  if ( !files->explicit_regs && !files->streaming )
162  {
163  int n,i;
164  const char **names = reader->tbx_idx ? tbx_seqnames(reader->tbx_idx, &n) : bcf_hdr_seqnames(reader->header, &n);
165  for (i=0; i<n; i++)
166  {
167  if ( !files->regions )
168  files->regions = _regions_init_string(names[i]);
169  else
170  _regions_add(files->regions, names[i], -1, -1);
171  }
172  free(names);
173  }
174 
175  return 1;
176 }
177 
179 {
180  bcf_srs_t *files = (bcf_srs_t*) calloc(1,sizeof(bcf_srs_t));
181  return files;
182 }
183 
184 static void bcf_sr_destroy1(bcf_sr_t *reader)
185 {
186  if ( reader->tbx_idx ) tbx_destroy(reader->tbx_idx);
187  if ( reader->bcf_idx ) hts_idx_destroy(reader->bcf_idx);
188  bcf_hdr_destroy(reader->header);
189  hts_close(reader->file);
190  if ( reader->itr ) tbx_itr_destroy(reader->itr);
191  int j;
192  for (j=0; j<reader->mbuffer; j++)
193  bcf_destroy1(reader->buffer[j]);
194  free(reader->buffer);
195  free(reader->samples);
196  free(reader->filter_ids);
197 }
199 {
200  int i;
201  for (i=0; i<files->nreaders; i++)
202  bcf_sr_destroy1(&files->readers[i]);
203  free(files->has_line);
204  free(files->readers);
205  for (i=0; i<files->n_smpl; i++) free(files->samples[i]);
206  free(files->samples);
207  if (files->targets) bcf_sr_regions_destroy(files->targets);
208  if (files->regions) bcf_sr_regions_destroy(files->regions);
209  if ( files->tmps.m ) free(files->tmps.s);
210  free(files);
211 }
212 
213 void bcf_sr_remove_reader(bcf_srs_t *files, int i)
214 {
215  assert( !files->samples ); // not ready for this yet
216  bcf_sr_destroy1(&files->readers[i]);
217  if ( i+1 < files->nreaders )
218  {
219  memmove(&files->readers[i], &files->readers[i+1], (files->nreaders-i-1)*sizeof(bcf_sr_t));
220  memmove(&files->has_line[i], &files->has_line[i+1], (files->nreaders-i-1)*sizeof(int));
221  }
222  files->nreaders--;
223 }
224 
225 
226 /*
227  Removes duplicate records from the buffer. The meaning of "duplicate" is
228  controlled by the $collapse variable, which can cause that from multiple
229  <indel|snp|any> lines only the first is considered and the rest is ignored.
230  The removal is done by setting the redundant lines' positions to -1 and
231  moving these lines at the end of the buffer.
232  */
233 static void collapse_buffer(bcf_srs_t *files, bcf_sr_t *reader)
234 {
235  int irec,jrec, has_snp=0, has_indel=0, has_any=0;
236  for (irec=1; irec<=reader->nbuffer; irec++)
237  {
238  bcf1_t *line = reader->buffer[irec];
239  if ( line->pos != reader->buffer[1]->pos ) break;
240  if ( files->collapse&COLLAPSE_ANY )
241  {
242  if ( !has_any ) has_any = 1;
243  else line->pos = -1;
244  }
245  int line_type = bcf_get_variant_types(line);
246  if ( files->collapse&COLLAPSE_SNPS && line_type&(VCF_SNP|VCF_MNP) )
247  {
248  if ( !has_snp ) has_snp = 1;
249  else line->pos = -1;
250  }
251  if ( files->collapse&COLLAPSE_INDELS && line_type&VCF_INDEL )
252  {
253  if ( !has_indel ) has_indel = 1;
254  else line->pos = -1;
255  }
256  }
257  bcf1_t *tmp;
258  irec = jrec = 1;
259  while ( irec<=reader->nbuffer && jrec<=reader->nbuffer )
260  {
261  if ( reader->buffer[irec]->pos != -1 ) { irec++; continue; }
262  if ( jrec<=irec ) jrec = irec+1;
263  while ( jrec<=reader->nbuffer && reader->buffer[jrec]->pos==-1 ) jrec++;
264  if ( jrec<=reader->nbuffer )
265  {
266  tmp = reader->buffer[irec]; reader->buffer[irec] = reader->buffer[jrec]; reader->buffer[jrec] = tmp;
267  }
268  }
269  reader->nbuffer = irec - 1;
270 }
271 
272 void debug_buffer(FILE *fp, bcf_sr_t *reader)
273 {
274  int j;
275  for (j=0; j<=reader->nbuffer; j++)
276  {
277  bcf1_t *line = reader->buffer[j];
278  fprintf(fp,"%s%s\t%s:%d\t%s ", reader->fname,j==0?"*":"",reader->header->id[BCF_DT_CTG][line->rid].key,line->pos+1,line->n_allele?line->d.allele[0]:"");
279  int k;
280  for (k=1; k<line->n_allele; k++) fprintf(fp," %s", line->d.allele[k]);
281  fprintf(fp,"\n");
282  }
283 }
284 
285 void debug_buffers(FILE *fp, bcf_srs_t *files)
286 {
287  int i;
288  for (i=0; i<files->nreaders; i++)
289  {
290  fprintf(fp, "has_line: %d\t%s\n", bcf_sr_has_line(files,i),files->readers[i].fname);
291  debug_buffer(fp, &files->readers[i]);
292  }
293  fprintf(fp,"\n");
294 }
295 
296 static inline int has_filter(bcf_sr_t *reader, bcf1_t *line)
297 {
298  int i, j;
299  if ( !line->d.n_flt )
300  {
301  for (j=0; j<reader->nfilter_ids; j++)
302  if ( reader->filter_ids[j]<0 ) return 1;
303  return 0;
304  }
305  for (i=0; i<line->d.n_flt; i++)
306  {
307  for (j=0; j<reader->nfilter_ids; j++)
308  if ( line->d.flt[i]==reader->filter_ids[j] ) return 1;
309  }
310  return 0;
311 }
312 
313 static int _reader_seek(bcf_sr_t *reader, const char *seq, int start, int end)
314 {
315  if ( end>=MAX_CSI_COOR )
316  {
317  fprintf(stderr,"The coordinate is out of csi index limit: %d\n", end+1);
318  exit(1);
319  }
320  if ( reader->itr )
321  {
322  hts_itr_destroy(reader->itr);
323  reader->itr = NULL;
324  }
325  reader->nbuffer = 0;
326  if ( reader->tbx_idx )
327  {
328  int tid = tbx_name2id(reader->tbx_idx, seq);
329  if ( tid==-1 ) return -1; // the sequence not present in this file
330  reader->itr = tbx_itr_queryi(reader->tbx_idx,tid,start,end+1);
331  }
332  else
333  {
334  int tid = bcf_hdr_name2id(reader->header, seq);
335  if ( tid==-1 ) return -1; // the sequence not present in this file
336  reader->itr = bcf_itr_queryi(reader->bcf_idx,tid,start,end+1);
337  }
338  assert(reader->itr);
339  return 0;
340 }
341 
342 /*
343  * _readers_next_region() - jumps to next region if necessary
344  * Returns 0 on success or -1 when there are no more regions left
345  */
346 static int _readers_next_region(bcf_srs_t *files)
347 {
348  // Need to open new chromosome? Check number of lines in all readers' buffers
349  int i, eos = 0;
350  for (i=0; i<files->nreaders; i++)
351  if ( !files->readers[i].itr && !files->readers[i].nbuffer ) eos++;
352 
353  if ( eos!=files->nreaders )
354  {
355  // Some of the readers still has buffered lines
356  return 0;
357  }
358 
359  // No lines in the buffer, need to open new region or quit
360  if ( bcf_sr_regions_next(files->regions)<0 ) return -1;
361 
362  for (i=0; i<files->nreaders; i++)
363  _reader_seek(&files->readers[i],files->regions->seq_names[files->regions->iseq],files->regions->start,files->regions->end);
364 
365  return 0;
366 }
367 
368 /*
369  * _reader_fill_buffer() - buffers all records with the same coordinate
370  */
371 static void _reader_fill_buffer(bcf_srs_t *files, bcf_sr_t *reader)
372 {
373  // Return if the buffer is full: the coordinate of the last buffered record differs
374  if ( reader->nbuffer && reader->buffer[reader->nbuffer]->pos != reader->buffer[1]->pos ) return;
375 
376  // No iterator (sequence not present in this file) and not streaming
377  if ( !reader->itr && !files->streaming ) return;
378 
379  // Fill the buffer with records starting at the same position
380  int i, ret = 0;
381  while (1)
382  {
383  if ( reader->nbuffer+1 >= reader->mbuffer )
384  {
385  // Increase buffer size
386  reader->mbuffer += 8;
387  reader->buffer = (bcf1_t**) realloc(reader->buffer, sizeof(bcf1_t*)*reader->mbuffer);
388  for (i=8; i>0; i--) // initialize
389  {
390  reader->buffer[reader->mbuffer-i] = bcf_init1();
391  reader->buffer[reader->mbuffer-i]->max_unpack = files->max_unpack;
392  reader->buffer[reader->mbuffer-i]->pos = -1; // for rare cases when VCF starts from 1
393  }
394  }
395  if ( files->streaming )
396  {
397  if ( reader->type & FT_VCF )
398  {
399  if ( (ret=hts_getline(reader->file, KS_SEP_LINE, &files->tmps)) < 0 ) break; // no more lines
400  int ret = vcf_parse1(&files->tmps, reader->header, reader->buffer[reader->nbuffer+1]);
401  if ( ret<0 ) break;
402  }
403  else if ( reader->type & FT_BCF )
404  {
405  if ( (ret=bcf_read1(reader->file, reader->header, reader->buffer[reader->nbuffer+1])) < 0 ) break; // no more lines
406  }
407  else
408  {
409  fprintf(stderr,"[%s:%d %s] fixme: not ready for this\n", __FILE__,__LINE__,__FUNCTION__);
410  exit(1);
411  }
412  }
413  else if ( reader->tbx_idx )
414  {
415  if ( (ret=tbx_itr_next(reader->file, reader->tbx_idx, reader->itr, &files->tmps)) < 0 ) break; // no more lines
416  vcf_parse1(&files->tmps, reader->header, reader->buffer[reader->nbuffer+1]);
417  }
418  else
419  {
420  if ( (ret=bcf_itr_next(reader->file, reader->itr, reader->buffer[reader->nbuffer+1])) < 0 ) break; // no more lines
421  bcf_subset_format(reader->header,reader->buffer[reader->nbuffer+1]);
422  }
423 
424  // apply filter
425  if ( !reader->nfilter_ids )
426  bcf_unpack(reader->buffer[reader->nbuffer+1], BCF_UN_STR);
427  else
428  {
429  bcf_unpack(reader->buffer[reader->nbuffer+1], BCF_UN_STR|BCF_UN_FLT);
430  if ( !has_filter(reader, reader->buffer[reader->nbuffer+1]) ) continue;
431  }
432  reader->nbuffer++;
433 
434  if ( reader->buffer[reader->nbuffer]->pos != reader->buffer[1]->pos ) break; // the buffer is full
435  }
436  if ( ret<0 )
437  {
438  // done for this region
439  tbx_itr_destroy(reader->itr);
440  reader->itr = NULL;
441  }
442  if ( files->collapse && reader->nbuffer>=2 && reader->buffer[1]->pos==reader->buffer[2]->pos )
443  collapse_buffer(files, reader);
444 }
445 
446 /*
447  * _readers_shift_buffer() - removes the first line and all subsequent lines with the same position
448  */
449 static void _reader_shift_buffer(bcf_sr_t *reader)
450 {
451  int i;
452  for (i=2; i<=reader->nbuffer; i++)
453  if ( reader->buffer[i]->pos!=reader->buffer[1]->pos ) break;
454  if ( i<=reader->nbuffer )
455  {
456  // A record with a different position follows, swap it. Because of the reader's logic,
457  // only one such line can be present.
458  bcf1_t *tmp = reader->buffer[1]; reader->buffer[1] = reader->buffer[i]; reader->buffer[i] = tmp;
459  reader->nbuffer = 1;
460  }
461  else
462  reader->nbuffer = 0; // no other line
463 }
464 
465 /*
466  * _reader_match_alleles() - from multiple buffered lines selects the one which
467  * corresponds best to the template line. The logic is controlled by COLLAPSE_*
468  * Returns 0 on success or -1 when no good matching line is found.
469  */
470 static int _reader_match_alleles(bcf_srs_t *files, bcf_sr_t *reader, bcf1_t *tmpl)
471 {
472  int i, irec = -1;
473 
474  // if no template given, use the first available record
475  if ( !tmpl )
476  irec = 1;
477  else
478  {
479  int tmpl_type = bcf_get_variant_types(tmpl);
480  for (i=1; i<=reader->nbuffer; i++)
481  {
482  bcf1_t *line = reader->buffer[i];
483  if ( line->pos != reader->buffer[1]->pos ) break; // done with this reader
484 
485  // Easiest case: matching by position only
486  if ( files->collapse&COLLAPSE_ANY ) { irec=i; break; }
487 
488  int line_type = bcf_get_variant_types(line);
489 
490  // No matter what the alleles are, as long as they are both SNPs
491  if ( files->collapse&COLLAPSE_SNPS && tmpl_type&VCF_SNP && line_type&VCF_SNP ) { irec=i; break; }
492  // ... or indels
493  if ( files->collapse&COLLAPSE_INDELS && tmpl_type&VCF_INDEL && line_type&VCF_INDEL ) { irec=i; break; }
494 
495  // More thorough checking: REFs must match
496  if ( tmpl->rlen != line->rlen ) continue; // different length
497  if ( strcmp(tmpl->d.allele[0], line->d.allele[0]) ) continue; // the strings do not match
498 
499  int ial,jal;
500  if ( files->collapse==COLLAPSE_NONE )
501  {
502  // Exact match, all alleles must be identical
503  if ( tmpl->n_allele!=line->n_allele ) continue; // different number of alleles, skip
504 
505  int nmatch = 1; // REF has been already checked
506  for (ial=1; ial<tmpl->n_allele; ial++)
507  {
508  for (jal=1; jal<line->n_allele; jal++)
509  if ( !strcmp(tmpl->d.allele[ial], line->d.allele[jal]) ) { nmatch++; break; }
510  }
511  if ( nmatch==tmpl->n_allele ) { irec=i; break; } // found: exact match
512  continue;
513  }
514 
515  if ( line->n_allele==1 && tmpl->n_allele==1 ) { irec=i; break; } // both sites are non-variant
516 
517  // COLLAPSE_SOME: at least some ALTs must match
518  for (ial=1; ial<tmpl->n_allele; ial++)
519  {
520  for (jal=1; jal<line->n_allele; jal++)
521  if ( !strcmp(tmpl->d.allele[ial], line->d.allele[jal]) ) { irec=i; break; }
522  if ( irec>=1 ) break;
523  }
524  if ( irec>=1 ) break;
525  }
526  if ( irec==-1 ) return -1; // no matching line was found
527  }
528 
529  // Set the selected line (irec) as active: set it to buffer[0], move the remaining lines forward
530  // and put the old bcf1_t record at the end.
531  bcf1_t *tmp = reader->buffer[0];
532  reader->buffer[0] = reader->buffer[irec];
533  for (i=irec+1; i<=reader->nbuffer; i++) reader->buffer[i-1] = reader->buffer[i];
534  reader->buffer[ reader->nbuffer ] = tmp;
535  reader->nbuffer--;
536 
537  return 0;
538 }
539 
541 {
542  int i, min_pos = INT_MAX;
543 
544  // Loop until next suitable line is found or all readers have finished
545  while ( 1 )
546  {
547  // Get all readers ready for the next region.
548  if ( files->regions && _readers_next_region(files)<0 ) break;
549 
550  // Fill buffers
551  const char *chr = NULL;
552  for (i=0; i<files->nreaders; i++)
553  {
554  _reader_fill_buffer(files, &files->readers[i]);
555 
556  // Update the minimum coordinate
557  if ( !files->readers[i].nbuffer ) continue;
558  if ( min_pos > files->readers[i].buffer[1]->pos )
559  {
560  min_pos = files->readers[i].buffer[1]->pos;
561  chr = bcf_seqname(files->readers[i].header, files->readers[i].buffer[1]);
562  }
563  }
564  if ( min_pos==INT_MAX )
565  {
566  if ( !files->regions ) break;
567  continue;
568  }
569 
570  // Skip this position if not present in targets
571  if ( files->targets )
572  {
573  if ( bcf_sr_regions_overlap(files->targets, chr, min_pos, min_pos)<0 )
574  {
575  // Remove all lines with this position from the buffer
576  for (i=0; i<files->nreaders; i++)
577  if ( files->readers[i].nbuffer && files->readers[i].buffer[1]->pos==min_pos )
578  _reader_shift_buffer(&files->readers[i]);
579  min_pos = INT_MAX;
580  continue;
581  }
582  }
583 
584  break; // done: min_pos is set
585  }
586 
587  // There can be records with duplicate positions. Set the active line intelligently so that
588  // the alleles match.
589  int nret = 0; // number of readers sharing the position
590  bcf1_t *first = NULL; // record which will be used for allele matching
591  for (i=0; i<files->nreaders; i++)
592  {
593  files->has_line[i] = 0;
594 
595  // Skip readers with no records at this position
596  if ( !files->readers[i].nbuffer || files->readers[i].buffer[1]->pos!=min_pos ) continue;
597 
598  // Until now buffer[0] of all reader was empty and the lines started at buffer[1].
599  // Now lines which are ready to be output will be moved to buffer[0].
600  if ( _reader_match_alleles(files, &files->readers[i], first) < 0 ) continue;
601  if ( !first ) first = files->readers[i].buffer[0];
602 
603  nret++;
604  files->has_line[i] = 1;
605  }
606  return nret;
607 }
608 
610 {
611  if ( !files->targets_als )
612  return _reader_next_line(files);
613 
614  while (1)
615  {
616  int i, ret = _reader_next_line(files);
617  if ( !ret ) return ret;
618 
619  for (i=0; i<files->nreaders; i++)
620  if ( files->has_line[i] ) break;
621 
622  if ( _regions_match_alleles(files->targets, files->targets_als-1, files->readers[i].buffer[0]) ) return ret;
623 
624  // Check if there are more duplicate lines in the buffers. If not, return this line as if it
625  // matched the targets, even if there is a type mismatch
626  for (i=0; i<files->nreaders; i++)
627  {
628  if ( !files->has_line[i] ) continue;
629  if ( files->readers[i].nbuffer==0 || files->readers[i].buffer[1]->pos!=files->readers[i].buffer[0]->pos ) continue;
630  break;
631  }
632  if ( i==files->nreaders ) return ret; // no more lines left, output even if target alleles are not of the same type
633  }
634 }
635 
636 static void bcf_sr_seek_start(bcf_srs_t *readers)
637 {
638  bcf_sr_regions_t *reg = readers->regions;
639  int i;
640  for (i=0; i<reg->nseqs; i++)
641  reg->regs[i].creg = -1;
642  reg->iseq = 0;
643 }
644 
645 
646 int bcf_sr_seek(bcf_srs_t *readers, const char *seq, int pos)
647 {
648  if ( !seq && !pos )
649  {
650  // seek to start
651  bcf_sr_seek_start(readers);
652  return 0;
653  }
654 
655  bcf_sr_regions_overlap(readers->regions, seq, pos, pos);
656  int i, nret = 0;
657  for (i=0; i<readers->nreaders; i++)
658  {
659  nret += _reader_seek(&readers->readers[i],seq,pos,MAX_CSI_COOR-1);
660  }
661  return nret;
662 }
663 
664 int bcf_sr_set_samples(bcf_srs_t *files, const char *fname, int is_file)
665 {
666  int i, j, nsmpl, free_smpl = 0;
667  char **smpl = NULL;
668 
669  void *exclude = (fname[0]=='^') ? khash_str2int_init() : NULL;
670  if ( exclude || strcmp("-",fname) ) // "-" stands for all samples
671  {
672  smpl = hts_readlist(fname, is_file, &nsmpl);
673  if ( !smpl )
674  {
675  fprintf(stderr,"Could not read the file: \"%s\"\n", fname);
676  return 0;
677  }
678  if ( exclude )
679  {
680  for (i=0; i<nsmpl; i++)
681  khash_str2int_inc(exclude, smpl[i]);
682  }
683  free_smpl = 1;
684  }
685  if ( !smpl )
686  {
687  smpl = files->readers[0].header->samples; // intersection of all samples
688  nsmpl = bcf_hdr_nsamples(files->readers[0].header);
689  }
690 
691  files->samples = NULL;
692  files->n_smpl = 0;
693  for (i=0; i<nsmpl; i++)
694  {
695  if ( exclude && khash_str2int_has_key(exclude,smpl[i]) ) continue;
696 
697  int n_isec = 0;
698  for (j=0; j<files->nreaders; j++)
699  {
700  if ( bcf_hdr_id2int(files->readers[j].header, BCF_DT_SAMPLE, smpl[i])<0 ) break;
701  n_isec++;
702  }
703  if ( n_isec!=files->nreaders )
704  {
705  fprintf(stderr,"Warning: The sample \"%s\" was not found in %s, skipping\n", smpl[i], files->readers[n_isec].fname);
706  continue;
707  }
708 
709  files->samples = (char**) realloc(files->samples, (files->n_smpl+1)*sizeof(const char*));
710  files->samples[files->n_smpl++] = strdup(smpl[i]);
711  }
712 
713  if ( exclude ) khash_str2int_destroy(exclude);
714  if ( free_smpl )
715  {
716  for (i=0; i<nsmpl; i++) free(smpl[i]);
717  free(smpl);
718  }
719 
720  if ( !files->n_smpl )
721  {
722  if ( files->nreaders>1 )
723  fprintf(stderr,"No samples in common.\n");
724  return 0;
725  }
726  for (i=0; i<files->nreaders; i++)
727  {
728  bcf_sr_t *reader = &files->readers[i];
729  reader->samples = (int*) malloc(sizeof(int)*files->n_smpl);
730  reader->n_smpl = files->n_smpl;
731  for (j=0; j<files->n_smpl; j++)
732  reader->samples[j] = bcf_hdr_id2int(reader->header, BCF_DT_SAMPLE, files->samples[j]);
733  }
734  return 1;
735 }
736 
737 // Add a new region into a list sorted by start,end. On input the coordinates
738 // are 1-based, stored 0-based, inclusive.
739 static void _regions_add(bcf_sr_regions_t *reg, const char *chr, int start, int end)
740 {
741  if ( start==-1 && end==-1 )
742  {
743  start = 0; end = MAX_CSI_COOR-1;
744  }
745  else
746  {
747  start--; end--; // store 0-based coordinates
748  }
749 
750  if ( !reg->seq_hash )
751  reg->seq_hash = khash_str2int_init();
752 
753  int iseq;
754  if ( khash_str2int_get(reg->seq_hash, chr, &iseq)<0 )
755  {
756  // the chromosome block does not exist
757  iseq = reg->nseqs++;
758  reg->seq_names = (char**) realloc(reg->seq_names,sizeof(char*)*reg->nseqs);
759  reg->regs = (region_t*) realloc(reg->regs,sizeof(region_t)*reg->nseqs);
760  memset(&reg->regs[reg->nseqs-1],0,sizeof(region_t));
761  reg->seq_names[iseq] = strdup(chr);
762  reg->regs[iseq].creg = -1;
763  khash_str2int_set(reg->seq_hash,reg->seq_names[iseq],iseq);
764  }
765 
766  region_t *creg = &reg->regs[iseq];
767 
768  // the regions may not be sorted on input: binary search
769  int i, min = 0, max = creg->nregs - 1;
770  while ( min<=max )
771  {
772  i = (max+min)/2;
773  if ( start < creg->regs[i].start ) max = i - 1;
774  else if ( start > creg->regs[i].start ) min = i + 1;
775  else break;
776  }
777  if ( min>max || creg->regs[i].start!=start || creg->regs[i].end!=end )
778  {
779  // no such region, insert a new one just after max
780  hts_expand(region1_t,creg->nregs+1,creg->mregs,creg->regs);
781  if ( ++max < creg->nregs )
782  memmove(&creg->regs[max+1],&creg->regs[max],(creg->nregs - max)*sizeof(region1_t));
783  creg->regs[max].start = start;
784  creg->regs[max].end = end;
785  creg->nregs++;
786  }
787 }
788 
789 // File name or a list of genomic locations. If file name, NULL is returned.
790 static bcf_sr_regions_t *_regions_init_string(const char *str)
791 {
792  bcf_sr_regions_t *reg = (bcf_sr_regions_t *) calloc(1, sizeof(bcf_sr_regions_t));
793  reg->start = reg->end = -1;
794  reg->prev_start = reg->prev_seq = -1;
795 
796  kstring_t tmp = {0,0,0};
797  const char *sp = str, *ep = str;
798  int from, to;
799  while ( 1 )
800  {
801  while ( *ep && *ep!=',' && *ep!=':' ) ep++;
802  tmp.l = 0;
803  kputsn(sp,ep-sp,&tmp);
804  if ( *ep==':' )
805  {
806  sp = ep+1;
807  from = strtol(sp,(char**)&ep,10);
808  if ( sp==ep )
809  {
810  fprintf(stderr,"[%s:%d %s] Could not parse the region(s): %s\n", __FILE__,__LINE__,__FUNCTION__,str);
811  free(reg); free(tmp.s); return NULL;
812  }
813  if ( !*ep || *ep==',' )
814  {
815  _regions_add(reg, tmp.s, from, from);
816  sp = ep;
817  continue;
818  }
819  if ( *ep!='-' )
820  {
821  fprintf(stderr,"[%s:%d %s] Could not parse the region(s): %s\n", __FILE__,__LINE__,__FUNCTION__,str);
822  free(reg); free(tmp.s); return NULL;
823  }
824  ep++;
825  sp = ep;
826  to = strtol(sp,(char**)&ep,10);
827  if ( *ep && *ep!=',' )
828  {
829  fprintf(stderr,"[%s:%d %s] Could not parse the region(s): %s\n", __FILE__,__LINE__,__FUNCTION__,str);
830  free(reg); free(tmp.s); return NULL;
831  }
832  if ( sp==ep ) to = MAX_CSI_COOR-1;
833  _regions_add(reg, tmp.s, from, to);
834  if ( !*ep ) break;
835  sp = ep;
836  }
837  else
838  {
839  if ( tmp.l ) _regions_add(reg, tmp.s, -1, -1);
840  if ( !*ep ) break;
841  sp = ++ep;
842  }
843  }
844  free(tmp.s);
845  return reg;
846 }
847 
848 // ichr,ifrom,ito are 0-based;
849 // returns -1 on error, 0 if the line is a comment line, 1 on success
850 static int _regions_parse_line(char *line, int ichr,int ifrom,int ito, char **chr,char **chr_end,int *from,int *to)
851 {
852  *chr_end = NULL;
853 
854  if ( line[0]=='#' ) return 0;
855 
856  int k,l; // index of the start and end column of the tab-delimited file
857  if ( ifrom <= ito )
858  k = ifrom, l = ito;
859  else
860  l = ifrom, k = ito;
861 
862  int i;
863  char *se = line, *ss = NULL; // start and end
864  char *tmp;
865  for (i=0; i<=k && *se; i++)
866  {
867  ss = i==0 ? se++ : ++se;
868  while (*se && *se!='\t') se++;
869  }
870  if ( i<=k ) return -1;
871  if ( k==l )
872  {
873  *from = *to = strtol(ss, &tmp, 10);
874  if ( tmp==ss ) return -1;
875  }
876  else
877  {
878  if ( k==ifrom )
879  *from = strtol(ss, &tmp, 10);
880  else
881  *to = strtol(ss, &tmp, 10);
882  if ( ss==tmp ) return -1;
883 
884  for (i=k; i<l && *se; i++)
885  {
886  ss = ++se;
887  while (*se && *se!='\t') se++;
888  }
889  if ( i<l ) return -1;
890  if ( k==ifrom )
891  *to = strtol(ss, &tmp, 10);
892  else
893  *from = strtol(ss, &tmp, 10);
894  if ( ss==tmp ) return -1;
895  }
896 
897  ss = se = line;
898  for (i=0; i<=ichr && *se; i++)
899  {
900  if ( i>0 ) ss = ++se;
901  while (*se && *se!='\t') se++;
902  }
903  if ( i<=ichr ) return -1;
904  *chr_end = se;
905  *chr = ss;
906  return 1;
907 }
908 
909 bcf_sr_regions_t *bcf_sr_regions_init(const char *regions, int is_file, int ichr, int ifrom, int ito)
910 {
911  bcf_sr_regions_t *reg;
912  if ( !is_file ) return _regions_init_string(regions);
913 
914  reg = (bcf_sr_regions_t *) calloc(1, sizeof(bcf_sr_regions_t));
915  reg->start = reg->end = -1;
916  reg->prev_start = reg->prev_seq = -1;
917 
918  reg->file = hts_open(regions, "rb");
919  if ( !reg->file )
920  {
921  fprintf(stderr,"[%s:%d %s] Could not open file: %s\n", __FILE__,__LINE__,__FUNCTION__,regions);
922  free(reg);
923  return NULL;
924  }
925 
926  reg->tbx = tbx_index_load(regions);
927  if ( !reg->tbx )
928  {
929  int len = strlen(regions);
930  int is_bed = strcasecmp(".bed",regions+len-4) ? 0 : 1;
931  if ( !is_bed && !strcasecmp(".bed.gz",regions+len-7) ) is_bed = 1;
932  int ft_type = hts_file_type(regions);
933  if ( ft_type & FT_VCF ) ito = 1;
934 
935  // read the whole file, tabix index is not present
936  while ( hts_getline(reg->file, KS_SEP_LINE, &reg->line) > 0 )
937  {
938  char *chr, *chr_end;
939  int from, to, ret;
940  ret = _regions_parse_line(reg->line.s, ichr,ifrom,abs(ito), &chr,&chr_end,&from,&to);
941  if ( ret < 0 )
942  {
943  if ( ito<0 )
944  ret = _regions_parse_line(reg->line.s, ichr,ifrom,ifrom, &chr,&chr_end,&from,&to);
945  if ( ret<0 )
946  {
947  fprintf(stderr,"[%s:%d] Could not parse the file %s, using the columns %d,%d[,%d]\n", __FILE__,__LINE__,regions,ichr+1,ifrom+1,ito+1);
948  hts_close(reg->file); reg->file = NULL; free(reg);
949  return NULL;
950  }
951  }
952  if ( !ret ) continue;
953  if ( is_bed ) from++;
954  *chr_end = 0;
955  _regions_add(reg, chr, from, to);
956  *chr_end = '\t';
957  }
958  hts_close(reg->file); reg->file = NULL;
959  if ( !reg->nseqs ) { free(reg); return NULL; }
960  return reg;
961  }
962 
963  reg->seq_names = (char**) tbx_seqnames(reg->tbx, &reg->nseqs);
964  if ( !reg->seq_hash )
965  reg->seq_hash = khash_str2int_init();
966  int i;
967  for (i=0; i<reg->nseqs; i++)
968  {
969  khash_str2int_set(reg->seq_hash,reg->seq_names[i],i);
970  }
971  reg->fname = strdup(regions);
972  reg->is_bin = 1;
973  return reg;
974 }
975 
977 {
978  int i;
979  free(reg->fname);
980  if ( reg->itr ) tbx_itr_destroy(reg->itr);
981  if ( reg->tbx ) tbx_destroy(reg->tbx);
982  if ( reg->file ) hts_close(reg->file);
983  if ( reg->als ) free(reg->als);
984  if ( reg->als_str.s ) free(reg->als_str.s);
985  free(reg->line.s);
986  if ( reg->regs )
987  {
988  // free only in-memory names, tbx names are const
989  for (i=0; i<reg->nseqs; i++)
990  {
991  free(reg->seq_names[i]);
992  free(reg->regs[i].regs);
993  }
994  }
995  free(reg->regs);
996  free(reg->seq_names);
997  khash_str2int_destroy(reg->seq_hash);
998  free(reg);
999 }
1000 
1001 int bcf_sr_regions_seek(bcf_sr_regions_t *reg, const char *seq)
1002 {
1003  reg->iseq = reg->start = reg->end = -1;
1004  if ( khash_str2int_get(reg->seq_hash, seq, &reg->iseq) < 0 ) return -1; // sequence seq not in regions
1005 
1006  // using in-memory regions
1007  if ( reg->regs ) return 0;
1008 
1009  // reading regions from tabix
1010  if ( reg->itr ) tbx_itr_destroy(reg->itr);
1011  reg->itr = tbx_itr_querys(reg->tbx, seq);
1012  if ( reg->itr ) return 0;
1013 
1014  return -1;
1015 }
1016 
1018 {
1019  if ( reg->iseq<0 ) return -1;
1020  reg->start = reg->end = -1;
1021  reg->nals = 0;
1022 
1023  // using in-memory regions
1024  if ( reg->regs )
1025  {
1026  while ( reg->iseq < reg->nseqs )
1027  {
1028  reg->regs[reg->iseq].creg++;
1029  if ( reg->regs[reg->iseq].creg < reg->regs[reg->iseq].nregs ) break;
1030  reg->iseq++;
1031  }
1032  if ( reg->iseq >= reg->nseqs ) { reg->iseq = -1; return -1; } // no more regions left
1033  region1_t *creg = &reg->regs[reg->iseq].regs[reg->regs[reg->iseq].creg];
1034  reg->start = creg->start;
1035  reg->end = creg->end;
1036  return 0;
1037  }
1038 
1039  // reading from tabix
1040  char *chr, *chr_end;
1041  int ichr = 0, ifrom = 1, ito = 2, is_bed = 0, from, to;
1042  if ( reg->tbx )
1043  {
1044  ichr = reg->tbx->conf.sc-1;
1045  ifrom = reg->tbx->conf.bc-1;
1046  ito = reg->tbx->conf.ec-1;
1047  if ( ito<0 ) ito = ifrom;
1048  is_bed = reg->tbx->conf.preset==TBX_UCSC ? 1 : 0;
1049  }
1050 
1051  int ret = 0;
1052  while ( !ret )
1053  {
1054  if ( reg->itr )
1055  {
1056  // tabix index present, reading a chromosome block
1057  ret = tbx_itr_next(reg->file, reg->tbx, reg->itr, &reg->line);
1058  if ( ret<0 ) { reg->iseq = -1; return -1; }
1059  }
1060  else
1061  {
1062  if ( reg->is_bin )
1063  {
1064  // Waited for seek which never came. Reopen in text mode and stream
1065  // through the regions, otherwise hts_getline would fail
1066  hts_close(reg->file);
1067  reg->file = hts_open(reg->fname, "r");
1068  if ( !reg->file )
1069  {
1070  fprintf(stderr,"[%s:%d %s] Could not open file: %s\n", __FILE__,__LINE__,__FUNCTION__,reg->fname);
1071  reg->file = NULL;
1073  return -1;
1074  }
1075  reg->is_bin = 0;
1076  }
1077 
1078  // tabix index absent, reading the whole file
1079  ret = hts_getline(reg->file, KS_SEP_LINE, &reg->line);
1080  if ( ret<0 ) { reg->iseq = -1; return -1; }
1081  }
1082  ret = _regions_parse_line(reg->line.s, ichr,ifrom,ito, &chr,&chr_end,&from,&to);
1083  if ( ret<0 )
1084  {
1085  fprintf(stderr,"[%s:%d] Could not parse the file %s, using the columns %d,%d,%d\n", __FILE__,__LINE__,reg->fname,ichr+1,ifrom+1,ito+1);
1086  return -1;
1087  }
1088  }
1089  if ( is_bed ) from++;
1090 
1091  *chr_end = 0;
1092  if ( khash_str2int_get(reg->seq_hash, chr, &reg->iseq)<0 )
1093  {
1094  fprintf(stderr,"Broken tabix index? The sequence \"%s\" not in dictionary [%s]\n", chr,reg->line.s);
1095  exit(1);
1096  }
1097  *chr_end = '\t';
1098 
1099  reg->start = from - 1;
1100  reg->end = to - 1;
1101  return 0;
1102 }
1103 
1104 static int _regions_match_alleles(bcf_sr_regions_t *reg, int als_idx, bcf1_t *rec)
1105 {
1106  int i = 0, max_len = 0;
1107  if ( !reg->nals )
1108  {
1109  char *ss = reg->line.s;
1110  while ( i<als_idx && *ss )
1111  {
1112  if ( *ss=='\t' ) i++;
1113  ss++;
1114  }
1115  char *se = ss;
1116  reg->nals = 1;
1117  while ( *se && *se!='\t' )
1118  {
1119  if ( *se==',' ) reg->nals++;
1120  se++;
1121  }
1122  ks_resize(&reg->als_str, se-ss+1+reg->nals);
1123  reg->als_str.l = 0;
1124  hts_expand(char*,reg->nals,reg->mals,reg->als);
1125  reg->nals = 0;
1126 
1127  se = ss;
1128  while ( *(++se) )
1129  {
1130  if ( *se=='\t' ) break;
1131  if ( *se!=',' ) continue;
1132  reg->als[reg->nals] = &reg->als_str.s[reg->als_str.l];
1133  kputsn(ss,se-ss,&reg->als_str);
1134  if ( &reg->als_str.s[reg->als_str.l] - reg->als[reg->nals] > max_len ) max_len = &reg->als_str.s[reg->als_str.l] - reg->als[reg->nals];
1135  reg->als_str.l++;
1136  reg->nals++;
1137  ss = ++se;
1138  }
1139  reg->als[reg->nals] = &reg->als_str.s[reg->als_str.l];
1140  kputsn(ss,se-ss,&reg->als_str);
1141  if ( &reg->als_str.s[reg->als_str.l] - reg->als[reg->nals] > max_len ) max_len = &reg->als_str.s[reg->als_str.l] - reg->als[reg->nals];
1142  reg->nals++;
1143  reg->als_type = max_len > 1 ? VCF_INDEL : VCF_SNP; // this is a simplified check, see vcf.c:bcf_set_variant_types
1144  }
1145  int type = bcf_get_variant_types(rec);
1146  if ( reg->als_type & VCF_INDEL )
1147  return type & VCF_INDEL ? 1 : 0;
1148  return !(type & VCF_INDEL) ? 1 : 0;
1149 }
1150 
1151 int bcf_sr_regions_overlap(bcf_sr_regions_t *reg, const char *seq, int start, int end)
1152 {
1153  int iseq;
1154  if ( khash_str2int_get(reg->seq_hash, seq, &iseq)<0 ) return -1; // no such sequence
1155 
1156  if ( reg->prev_seq==-1 || iseq!=reg->prev_seq || reg->prev_start > start ) // new chromosome or after a seek
1157  {
1158  // flush regions left on previous chromosome
1159  if ( reg->missed_reg_handler && reg->prev_seq!=-1 && reg->iseq!=-1 )
1160  bcf_sr_regions_flush(reg);
1161 
1162  bcf_sr_regions_seek(reg, seq);
1163  reg->start = reg->end = -1;
1164  }
1165  if ( reg->prev_seq==iseq && reg->iseq!=iseq ) return -2; // no more regions on this chromosome
1166  reg->prev_seq = reg->iseq;
1167  reg->prev_start = start;
1168 
1169  while ( iseq==reg->iseq && reg->end < start )
1170  {
1171  if ( bcf_sr_regions_next(reg) < 0 ) return -2; // no more regions left
1172  if ( reg->iseq != iseq ) return -1; // does not overlap any regions
1173  if ( reg->missed_reg_handler && reg->end < start ) reg->missed_reg_handler(reg, reg->missed_reg_data);
1174  }
1175  if ( reg->start <= end ) return 0; // region overlap
1176  return -1; // no overlap
1177 }
1178 
1180 {
1181  if ( !reg->missed_reg_handler || reg->prev_seq==-1 ) return;
1182  while ( !bcf_sr_regions_next(reg) ) reg->missed_reg_handler(reg, reg->missed_reg_data);
1183  return;
1184 }
1185