NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
tbx.c
Go to the documentation of this file.
1 #include <stdlib.h>
2 #include <string.h>
3 #include <ctype.h>
4 #include <stdio.h>
5 #include <assert.h>
6 #include "htslib/tbx.h"
7 #include "htslib/bgzf.h"
8 
9 #include "htslib/khash.h"
11 
12 tbx_conf_t tbx_conf_gff = { 0, 1, 4, 5, '#', 0 };
13 tbx_conf_t tbx_conf_bed = { TBX_UCSC, 1, 2, 3, '#', 0 };
14 tbx_conf_t tbx_conf_psltbl = { TBX_UCSC, 15, 17, 18, '#', 0 };
15 tbx_conf_t tbx_conf_sam = { TBX_SAM, 3, 4, 0, '@', 0 };
16 tbx_conf_t tbx_conf_vcf = { TBX_VCF, 1, 2, 0, '#', 0 };
17 
18 typedef struct {
19  int64_t beg, end;
20  char *ss, *se;
21  int tid;
22 } tbx_intv_t;
23 
24 static inline int get_tid(tbx_t *tbx, const char *ss, int is_add)
25 {
26  khint_t k;
27  khash_t(s2i) *d;
28  if (tbx->dict == 0) tbx->dict = kh_init(s2i);
29  d = (khash_t(s2i)*)tbx->dict;
30  if (is_add) {
31  int absent;
32  k = kh_put(s2i, d, ss, &absent);
33  if (absent) {
34  kh_key(d, k) = strdup(ss);
35  kh_val(d, k) = kh_size(d) - 1;
36  }
37  } else k = kh_get(s2i, d, ss);
38  return k == kh_end(d)? -1 : kh_val(d, k);
39 }
40 
41 int tbx_name2id(tbx_t *tbx, const char *ss)
42 {
43  return get_tid(tbx, ss, 0);
44 }
45 
46 int tbx_parse1(const tbx_conf_t *conf, int len, char *line, tbx_intv_t *intv)
47 {
48  int i, b = 0, id = 1, ncols = 0;
49  char *s;
50  intv->ss = intv->se = 0; intv->beg = intv->end = -1;
51  for (i = 0; i <= len; ++i) {
52  if (line[i] == '\t' || line[i] == 0) {
53  ++ncols;
54  if (id == conf->sc) {
55  intv->ss = line + b; intv->se = line + i;
56  } else if (id == conf->bc) {
57  // here ->beg is 0-based.
58  intv->beg = intv->end = strtol(line + b, &s, 0);
59  if ( s==line+b ) return -1; // expected int
60  if (!(conf->preset&TBX_UCSC)) --intv->beg;
61  else ++intv->end;
62  if (intv->beg < 0) intv->beg = 0;
63  if (intv->end < 1) intv->end = 1;
64  } else {
65  if ((conf->preset&0xffff) == TBX_GENERIC) {
66  if (id == conf->ec)
67  {
68  intv->end = strtol(line + b, &s, 0);
69  if ( s==line+b ) return -1; // expected int
70  }
71  } else if ((conf->preset&0xffff) == TBX_SAM) {
72  if (id == 6) { // CIGAR
73  int l = 0, op;
74  char *t;
75  for (s = line + b; s < line + i;) {
76  long x = strtol(s, &t, 10);
77  op = toupper(*t);
78  if (op == 'M' || op == 'D' || op == 'N') l += x;
79  s = t + 1;
80  }
81  if (l == 0) l = 1;
82  intv->end = intv->beg + l;
83  }
84  } else if ((conf->preset&0xffff) == TBX_VCF) {
85  if (id == 4) {
86  if (b < i) intv->end = intv->beg + (i - b);
87  } else if (id == 8) { // look for "END="
88  int c = line[i];
89  line[i] = 0;
90  s = strstr(line + b, "END=");
91  if (s == line + b) s += 4;
92  else if (s) {
93  s = strstr(line + b, ";END=");
94  if (s) s += 5;
95  }
96  if (s) intv->end = strtol(s, &s, 0);
97  line[i] = c;
98  }
99  }
100  }
101  b = i + 1;
102  ++id;
103  }
104  }
105  if (intv->ss == 0 || intv->se == 0 || intv->beg < 0 || intv->end < 0) return -1;
106  return 0;
107 }
108 
109 static inline int get_intv(tbx_t *tbx, kstring_t *str, tbx_intv_t *intv, int is_add)
110 {
111  if (tbx_parse1(&tbx->conf, str->l, str->s, intv) == 0) {
112  int c = *intv->se;
113  *intv->se = '\0'; intv->tid = get_tid(tbx, intv->ss, is_add); *intv->se = c;
114  return (intv->tid >= 0 && intv->beg >= 0 && intv->end >= 0)? 0 : -1;
115  } else {
116  char *type = NULL;
117  switch (tbx->conf.preset&0xffff)
118  {
119  case TBX_SAM: type = "TBX_SAM"; break;
120  case TBX_VCF: type = "TBX_VCF"; break;
121  case TBX_UCSC: type = "TBX_UCSC"; break;
122  default: type = "TBX_GENERIC"; break;
123  }
124  fprintf(stderr, "[E::%s] failed to parse %s, was wrong -p [type] used?\nThe offending line was: \"%s\"\n", __func__, type, str->s);
125  return -1;
126  }
127 }
128 
129 int tbx_readrec(BGZF *fp, void *tbxv, void *sv, int *tid, int *beg, int *end)
130 {
131  tbx_t *tbx = (tbx_t *) tbxv;
132  kstring_t *s = (kstring_t *) sv;
133  int ret;
134  if ((ret = bgzf_getline(fp, '\n', s)) >= 0) {
135  tbx_intv_t intv;
136  get_intv(tbx, s, &intv, 0);
137  *tid = intv.tid; *beg = intv.beg; *end = intv.end;
138  }
139  return ret;
140 }
141 
142 void tbx_set_meta(tbx_t *tbx)
143 {
144  int i, l = 0, l_nm;
145  uint32_t x[7];
146  char **name;
147  uint8_t *meta;
148  khint_t k;
149  khash_t(s2i) *d = (khash_t(s2i)*)tbx->dict;
150 
151  memcpy(x, &tbx->conf, 24);
152  name = (char**)malloc(sizeof(char*) * kh_size(d));
153  for (k = kh_begin(d), l = 0; k != kh_end(d); ++k) {
154  if (!kh_exist(d, k)) continue;
155  name[kh_val(d, k)] = (char*)kh_key(d, k);
156  l += strlen(kh_key(d, k)) + 1; // +1 to include '\0'
157  }
158  l_nm = x[6] = l;
159  meta = (uint8_t*)malloc(l_nm + 28);
160  if (ed_is_big())
161  for (i = 0; i < 7; ++i)
162  x[i] = ed_swap_4(x[i]);
163  memcpy(meta, x, 28);
164  for (l = 28, i = 0; i < (int)kh_size(d); ++i) {
165  int x = strlen(name[i]) + 1;
166  memcpy(meta + l, name[i], x);
167  l += x;
168  }
169  free(name);
170  hts_idx_set_meta(tbx->idx, l, meta, 0);
171 }
172 
173 tbx_t *tbx_index(BGZF *fp, int min_shift, const tbx_conf_t *conf)
174 {
175  tbx_t *tbx;
176  kstring_t str;
177  int ret, first = 0, n_lvls, fmt;
178  int64_t lineno = 0;
179  uint64_t last_off = 0;
180  tbx_intv_t intv;
181 
182  str.s = 0; str.l = str.m = 0;
183  tbx = (tbx_t*)calloc(1, sizeof(tbx_t));
184  tbx->conf = *conf;
185  if (min_shift > 0) n_lvls = (TBX_MAX_SHIFT - min_shift + 2) / 3, fmt = HTS_FMT_CSI;
186  else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_TBI;
187  while ((ret = bgzf_getline(fp, '\n', &str)) >= 0) {
188  ++lineno;
189  if (lineno <= tbx->conf.line_skip || str.s[0] == tbx->conf.meta_char) {
190  last_off = bgzf_tell(fp);
191  continue;
192  }
193  if (first == 0) {
194  tbx->idx = hts_idx_init(0, fmt, last_off, min_shift, n_lvls);
195  first = 1;
196  }
197  get_intv(tbx, &str, &intv, 1);
198  ret = hts_idx_push(tbx->idx, intv.tid, intv.beg, intv.end, bgzf_tell(fp), 1);
199  if (ret < 0)
200  {
201  free(str.s);
202  tbx_destroy(tbx);
203  return NULL;
204  }
205  }
206  if ( !tbx->idx ) tbx->idx = hts_idx_init(0, fmt, last_off, min_shift, n_lvls); // empty file
207  if ( !tbx->dict ) tbx->dict = kh_init(s2i);
208  hts_idx_finish(tbx->idx, bgzf_tell(fp));
209  tbx_set_meta(tbx);
210  free(str.s);
211  return tbx;
212 }
213 
214 void tbx_destroy(tbx_t *tbx)
215 {
216  khash_t(s2i) *d = (khash_t(s2i)*)tbx->dict;
217  if (d != NULL)
218  {
219  khint_t k;
220  for (k = kh_begin(d); k != kh_end(d); ++k)
221  if (kh_exist(d, k)) free((char*)kh_key(d, k));
222  }
223  hts_idx_destroy(tbx->idx);
224  kh_destroy(s2i, d);
225  free(tbx);
226 }
227 
228 int tbx_index_build(const char *fn, int min_shift, const tbx_conf_t *conf)
229 {
230  tbx_t *tbx;
231  BGZF *fp;
232  if ( bgzf_is_bgzf(fn)!=1 ) { fprintf(stderr,"Not a BGZF file: %s\n", fn); return -1; }
233  if ((fp = bgzf_open(fn, "r")) == 0) return -1;
234  if ( !fp->is_compressed ) { bgzf_close(fp); return -1; }
235  tbx = tbx_index(fp, min_shift, conf);
236  bgzf_close(fp);
237  if ( !tbx ) return -1;
238  hts_idx_save(tbx->idx, fn, min_shift > 0? HTS_FMT_CSI : HTS_FMT_TBI);
239  tbx_destroy(tbx);
240  return 0;
241 }
242 
243 tbx_t *tbx_index_load(const char *fn)
244 {
245  tbx_t *tbx;
246  uint8_t *meta;
247  char *nm, *p;
248  uint32_t x[7];
249  int l_meta, l_nm;
250  tbx = (tbx_t*)calloc(1, sizeof(tbx_t));
251  tbx->idx = hts_idx_load(fn, HTS_FMT_TBI);
252  if ( !tbx->idx )
253  {
254  free(tbx);
255  return NULL;
256  }
257  meta = hts_idx_get_meta(tbx->idx, &l_meta);
258  memcpy(x, meta, 28);
259  memcpy(&tbx->conf, x, 24);
260  p = nm = (char*)meta + 28;
261  l_nm = x[6];
262  for (; p - nm < l_nm; p += strlen(p) + 1) get_tid(tbx, p, 1);
263  return tbx;
264 }
265 
266 const char **tbx_seqnames(tbx_t *tbx, int *n)
267 {
268  khash_t(s2i) *d = (khash_t(s2i)*)tbx->dict;
269  if (d == NULL)
270  {
271  *n = 0;
272  return NULL;
273  }
274  int tid, m = kh_size(d);
275  const char **names = (const char**) calloc(m,sizeof(const char*));
276  khint_t k;
277  for (k=kh_begin(d); k<kh_end(d); k++)
278  {
279  if ( !kh_exist(d,k) ) continue;
280  tid = kh_val(d,k);
281  assert( tid<m );
282  names[tid] = kh_key(d,k);
283  }
284  // sanity check: there should be no gaps
285  for (tid=0; tid<m; tid++)
286  assert(names[tid]);
287  *n = m;
288  return names;
289 }
290