NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
vcf_sweep.c
Go to the documentation of this file.
1 #include "htslib/vcf_sweep.h"
2 #include "htslib/bgzf.h"
3 
4 #define SW_FWD 0
5 #define SW_BWD 1
6 
8 {
11  BGZF *fp;
12 
13  int direction; // to tell if the direction has changed
14  int block_size; // the size of uncompressed data to hold in memory
15  bcf1_t *rec; // bcf buffer
16  int nrec, mrec; // number of used records; total size of the buffer
17  int lrid, lpos, lnals, lals_len, mlals; // to check uniqueness of a record
18  char *lals;
19 
20  uint64_t *idx; // uncompressed offsets of VCF/BCF records
21  int iidx, nidx, midx; // i: current offset; n: used; m: allocated
22  int idx_done; // the index is built during the first pass
23 };
24 
26 int hts_useek(htsFile *file, long uoffset, int where);
27 long hts_utell(htsFile *file);
28 
29 static inline int sw_rec_equal(bcf_sweep_t *sw, bcf1_t *rec)
30 {
31  if ( sw->lrid!=rec->rid ) return 0;
32  if ( sw->lpos!=rec->pos ) return 0;
33  if ( sw->lnals!=rec->n_allele ) return 0;
34 
35  char *t = rec->d.allele[sw->lnals-1];
36  int len = t - rec->d.allele[0] + 1;
37  while ( *t ) { t++; len++; }
38  if ( sw->lals_len!=len ) return 0;
39  if ( memcmp(sw->lals,rec->d.allele[0],len) ) return 0;
40  return 1;
41 }
42 
43 static void sw_rec_save(bcf_sweep_t *sw, bcf1_t *rec)
44 {
45  sw->lrid = rec->rid;
46  sw->lpos = rec->pos;
47  sw->lnals = rec->n_allele;
48 
49  char *t = rec->d.allele[sw->lnals-1];
50  int len = t - rec->d.allele[0] + 1;
51  while ( *t ) { t++; len++; }
52  sw->lals_len = len;
53  hts_expand(char, len, sw->mlals, sw->lals);
54  memcpy(sw->lals, rec->d.allele[0], len);
55 }
56 
57 static void sw_fill_buffer(bcf_sweep_t *sw)
58 {
59  if ( !sw->iidx ) return;
60  sw->iidx--;
61 
62  int ret = hts_useek(sw->file, sw->idx[sw->iidx], 0);
63  assert( ret==0 );
64 
65  sw->nrec = 0;
66  bcf1_t *rec = &sw->rec[sw->nrec];
67  while ( (ret=bcf_read1(sw->file, sw->hdr, rec))==0 )
68  {
69  bcf_unpack(rec, BCF_UN_STR);
70 
71  // if not in the last block, stop at the saved record
72  if ( sw->iidx+1 < sw->nidx && sw_rec_equal(sw,rec) ) break;
73 
74  sw->nrec++;
75  hts_expand0(bcf1_t, sw->nrec+1, sw->mrec, sw->rec);
76  rec = &sw->rec[sw->nrec];
77  }
78  sw_rec_save(sw, &sw->rec[0]);
79 }
80 
81 bcf_sweep_t *bcf_sweep_init(const char *fname)
82 {
83  bcf_sweep_t *sw = (bcf_sweep_t*) calloc(1,sizeof(bcf_sweep_t));
84  sw->file = hts_open(fname, "r");
85  sw->fp = hts_get_bgzfp(sw->file);
87  sw->hdr = bcf_hdr_read(sw->file);
88  sw->mrec = 1;
89  sw->rec = (bcf1_t*) calloc(sw->mrec,(sizeof(bcf1_t)));
90  sw->block_size = 1024*1024*3;
91  sw->direction = SW_FWD;
92  return sw;
93 }
94 
95 void bcf_empty1(bcf1_t *v);
97 {
98  int i;
99  for (i=0; i<sw->mrec; i++) bcf_empty1(&sw->rec[i]);
100  free(sw->idx);
101  free(sw->rec);
102  free(sw->lals);
103  bcf_hdr_destroy(sw->hdr);
104  hts_close(sw->file);
105  free(sw);
106 }
107 
108 static void sw_seek(bcf_sweep_t *sw, int direction)
109 {
110  sw->direction = direction;
111  if ( direction==SW_FWD )
112  hts_useek(sw->file, sw->idx[0], 0);
113  else
114  {
115  sw->iidx = sw->nidx;
116  sw->nrec = 0;
117  }
118 }
119 
121 {
122  if ( sw->direction==SW_BWD ) sw_seek(sw, SW_FWD);
123 
124  long pos = hts_utell(sw->file);
125 
126  bcf1_t *rec = &sw->rec[0];
127  int ret = bcf_read1(sw->file, sw->hdr, rec);
128 
129  if ( ret!=0 ) // last record, get ready for sweeping backwards
130  {
131  sw->idx_done = 1;
132  sw->fp->idx_build_otf = 0;
133  sw_seek(sw, SW_BWD);
134  return NULL;
135  }
136 
137  if ( !sw->idx_done )
138  {
139  if ( !sw->nidx || pos - sw->idx[sw->nidx-1] > sw->block_size )
140  {
141  sw->nidx++;
142  hts_expand(uint64_t, sw->nidx, sw->midx, sw->idx);
143  sw->idx[sw->nidx-1] = pos;
144  }
145  }
146  return rec;
147 }
148 
150 {
151  if ( sw->direction==SW_FWD ) sw_seek(sw, SW_BWD);
152  if ( !sw->nrec ) sw_fill_buffer(sw);
153  if ( !sw->nrec ) return NULL;
154  return &sw->rec[ --sw->nrec ];
155 }
156 
157 bcf_hdr_t *bcf_sweep_hdr(bcf_sweep_t *sw) { return sw->hdr; }
158