NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
cram_index.c
Go to the documentation of this file.
1 /*
2 Copyright (c) 2013-2014 Genome Research Ltd.
3 Author: James Bonfield <jkb@sanger.ac.uk>
4 
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
7 
8  1. Redistributions of source code must retain the above copyright notice,
9 this list of conditions and the following disclaimer.
10 
11  2. Redistributions in binary form must reproduce the above copyright notice,
12 this list of conditions and the following disclaimer in the documentation
13 and/or other materials provided with the distribution.
14 
15  3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16 Institute nor the names of its contributors may be used to endorse or promote
17 products derived from this software without specific prior written permission.
18 
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30 
31 /*
32  * The index is a gzipped tab-delimited text file with one line per slice.
33  * The columns are:
34  * 1: reference number (0 to N-1, as per BAM ref_id)
35  * 2: reference position of 1st read in slice (1..?)
36  * 3: number of reads in slice
37  * 4: offset of container start (relative to end of SAM header, so 1st
38  * container is offset 0).
39  * 5: slice number within container (ie which landmark).
40  *
41  * In memory, we hold this in a nested containment list. Each list element is
42  * a cram_index struct. Each element in turn can contain its own list of
43  * cram_index structs.
44  *
45  * Any start..end range which is entirely contained within another (and
46  * earlier as it is sorted) range will be held within it. This ensures that
47  * the outer list will never have containments and we can safely do a
48  * binary search to find the first range which overlaps any given coordinate.
49  */
50 
51 #ifdef HAVE_CONFIG_H
52 #include "io_lib_config.h"
53 #endif
54 
55 #include <stdio.h>
56 #include <errno.h>
57 #include <assert.h>
58 #include <stdlib.h>
59 #include <string.h>
60 #include <zlib.h>
61 #include <sys/types.h>
62 #include <sys/stat.h>
63 #include <math.h>
64 #include <ctype.h>
65 
66 #include "htslib/hfile.h"
67 #include "cram/cram.h"
68 #include "cram/os.h"
69 #include "cram/zfio.h"
70 
71 #if 0
72 static void dump_index_(cram_index *e, int level) {
73  int i, n;
74  n = printf("%*s%d / %d .. %d, ", level*4, "", e->refid, e->start, e->end);
75  printf("%*soffset %"PRId64"\n", MAX(0,50-n), "", e->offset);
76  for (i = 0; i < e->nslice; i++) {
77  dump_index_(&e->e[i], level+1);
78  }
79 }
80 
81 static void dump_index(cram_fd *fd) {
82  int i;
83  for (i = 0; i < fd->index_sz; i++) {
84  dump_index_(&fd->index[i], 0);
85  }
86 }
87 #endif
88 
89 /*
90  * Loads a CRAM .crai index into memory.
91  *
92  * Returns 0 for success
93  * -1 for failure
94  */
95 int cram_index_load(cram_fd *fd, const char *fn) {
96  char fn2[PATH_MAX];
97  char buf[65536];
98  ssize_t len;
99  kstring_t kstr = {0};
100  hFILE *fp;
101  cram_index *idx;
102  cram_index **idx_stack = NULL, *ep, e;
103  int idx_stack_alloc = 0, idx_stack_ptr = 0;
104  size_t pos = 0;
105 
106  /* Check if already loaded */
107  if (fd->index)
108  return 0;
109 
110  fd->index = calloc((fd->index_sz = 1), sizeof(*fd->index));
111  if (!fd->index)
112  return -1;
113 
114  idx = &fd->index[0];
115  idx->refid = -1;
116  idx->start = INT_MIN;
117  idx->end = INT_MAX;
118 
119  idx_stack = calloc(++idx_stack_alloc, sizeof(*idx_stack));
120  idx_stack[idx_stack_ptr] = idx;
121 
122  sprintf(fn2, "%s.crai", fn);
123  if (!(fp = hopen(fn2, "r"))) {
124  perror(fn2);
125  free(idx_stack);
126  return -1;
127  }
128 
129  // Load the file into memory
130  while ((len = hread(fp, buf, 65536)) > 0)
131  kputsn(buf, len, &kstr);
132  if (len < 0 || kstr.l < 2) {
133  if (kstr.s)
134  free(kstr.s);
135  free(idx_stack);
136  return -1;
137  }
138 
139  if (hclose(fp)) {
140  if (kstr.s)
141  free(kstr.s);
142  free(idx_stack);
143  return -1;
144  }
145 
146 
147  // Uncompress if required
148  if (kstr.s[0] == 31 && (uc)kstr.s[1] == 139) {
149  size_t l;
150  char *s = zlib_mem_inflate(kstr.s, kstr.l, &l);
151  free(kstr.s);
152  if (!s) {
153  free(idx_stack);
154  return -1;
155  }
156  kstr.s = s;
157  kstr.l = l;
158  }
159 
160 
161  // Parse it line at a time
162  do {
163  int nchars;
164  char *line = &kstr.s[pos];
165 
166  /* 1.1 layout */
167  if (sscanf(line, "%d\t%d\t%d\t%"PRId64"\t%d\t%d%n",
168  &e.refid,
169  &e.start,
170  &e.end,
171  &e.offset,
172  &e.slice,
173  &e.len,
174  &nchars) != 6) {
175  free(kstr.s);
176  free(idx_stack);
177  return -1;
178  }
179 
180  e.end += e.start-1;
181  //printf("%d/%d..%d\n", e.refid, e.start, e.end);
182 
183  if (e.refid < -1) {
184  free(kstr.s);
185  free(idx_stack);
186  fprintf(stderr, "Malformed index file, refid %d\n", e.refid);
187  return -1;
188  }
189 
190  if (e.refid != idx->refid) {
191  if (fd->index_sz < e.refid+2) {
192  size_t index_end = fd->index_sz * sizeof(*fd->index);
193  fd->index_sz = e.refid+2;
194  fd->index = realloc(fd->index,
195  fd->index_sz * sizeof(*fd->index));
196  memset(((char *)fd->index) + index_end, 0,
197  fd->index_sz * sizeof(*fd->index) - index_end);
198  }
199  idx = &fd->index[e.refid+1];
200  idx->refid = e.refid;
201  idx->start = INT_MIN;
202  idx->end = INT_MAX;
203  idx->nslice = idx->nalloc = 0;
204  idx->e = NULL;
205  idx_stack[(idx_stack_ptr = 0)] = idx;
206  }
207 
208  while (!(e.start >= idx->start && e.end <= idx->end)) {
209  idx = idx_stack[--idx_stack_ptr];
210  }
211 
212  // Now contains, so append
213  if (idx->nslice+1 >= idx->nalloc) {
214  idx->nalloc = idx->nalloc ? idx->nalloc*2 : 16;
215  idx->e = realloc(idx->e, idx->nalloc * sizeof(*idx->e));
216  }
217 
218  e.nalloc = e.nslice = 0; e.e = NULL;
219  *(ep = &idx->e[idx->nslice++]) = e;
220  idx = ep;
221 
222  if (++idx_stack_ptr >= idx_stack_alloc) {
223  idx_stack_alloc *= 2;
224  idx_stack = realloc(idx_stack, idx_stack_alloc*sizeof(*idx_stack));
225  }
226  idx_stack[idx_stack_ptr] = idx;
227 
228  pos += nchars;
229  while (pos < kstr.l && kstr.s[pos] != '\n')
230  pos++;
231  pos++;
232  } while (pos < kstr.l);
233 
234  free(idx_stack);
235  free(kstr.s);
236 
237  // dump_index(fd);
238 
239  return 0;
240 }
241 
242 static void cram_index_free_recurse(cram_index *e) {
243  if (e->e) {
244  int i;
245  for (i = 0; i < e->nslice; i++) {
246  cram_index_free_recurse(&e->e[i]);
247  }
248  free(e->e);
249  }
250 }
251 
253  int i;
254 
255  if (!fd->index)
256  return;
257 
258  for (i = 0; i < fd->index_sz; i++) {
259  cram_index_free_recurse(&fd->index[i]);
260  }
261  free(fd->index);
262 
263  fd->index = NULL;
264 }
265 
266 /*
267  * Searches the index for the first slice overlapping a reference ID
268  * and position, or one immediately preceeding it if none is found in
269  * the index to overlap this position. (Our index may have missing
270  * entries, but we require at least one per reference.)
271  *
272  * If the index finds multiple slices overlapping this position we
273  * return the first one only. Subsequent calls should specifying
274  * "from" as the last slice we checked to find the next one. Otherwise
275  * set "from" to be NULL to find the first one.
276  *
277  * Returns the cram_index pointer on sucess
278  * NULL on failure
279  */
280 cram_index *cram_index_query(cram_fd *fd, int refid, int pos,
281  cram_index *from) {
282  int i, j, k;
283  cram_index *e;
284 
285  if (refid+1 < 0 || refid+1 >= fd->index_sz)
286  return NULL;
287 
288  i = 0, j = fd->index[refid+1].nslice-1;
289 
290  if (!from)
291  from = &fd->index[refid+1];
292 
293  for (k = j/2; k != i; k = (j-i)/2 + i) {
294  if (from->e[k].refid > refid) {
295  j = k;
296  continue;
297  }
298 
299  if (from->e[k].refid < refid) {
300  i = k;
301  continue;
302  }
303 
304  if (from->e[k].start >= pos) {
305  j = k;
306  continue;
307  }
308 
309  if (from->e[k].start < pos) {
310  i = k;
311  continue;
312  }
313  }
314 
315  /* The above found *a* bin overlapping, but not necessarily the first */
316  while (i > 0 && from->e[i-1].end >= pos)
317  i--;
318 
319  /* Special case for matching a start pos */
320  if (i+1 < from->nslice &&
321  from->e[i+1].start == pos &&
322  from->e[i+1].refid == refid)
323  i++;
324 
325  e = &from->e[i];
326 
327  return e;
328 }
329 
330 
331 /*
332  * Skips to a container overlapping the start coordinate listed in
333  * cram_range.
334  *
335  * In theory we call cram_index_query multiple times, once per slice
336  * overlapping the range. However slices may be absent from the index
337  * which makes this problematic. Instead we find the left-most slice
338  * and then read from then on, skipping decoding of slices and/or
339  * whole containers when they don't overlap the specified cram_range.
340  *
341  * Returns 0 on success
342  * -1 on failure
343  */
345  cram_index *e;
346 
347  // Ideally use an index, so see if we have one.
348  if ((e = cram_index_query(fd, r->refid, r->start, NULL))) {
349  if (0 != cram_seek(fd, e->offset, SEEK_SET))
350  if (0 != cram_seek(fd, e->offset - fd->first_container, SEEK_CUR))
351  return -1;
352  } else {
353  fprintf(stderr, "Unknown reference ID. Missing from index?\n");
354  return -1;
355  }
356 
357  if (fd->ctr) {
359  fd->ctr = NULL;
360  }
361 
362  return 0;
363 }
364 
365 
366 /*
367  * A specialised form of cram_index_build (below) that deals with slices
368  * having multiple references in this (ref_id -2). In this scenario we
369  * decode the slice to look at the RI data series instead.
370  *
371  * Returns 0 on success
372  * -1 on failure
373  */
374 static int cram_index_build_multiref(cram_fd *fd,
375  cram_container *c,
376  cram_slice *s,
377  zfp *fp,
378  off_t cpos,
379  int32_t landmark,
380  int sz) {
381  int i, ref = -2, ref_start = 0, ref_end;
382  char buf[1024];
383 
384  if (0 != cram_decode_slice(fd, c, s, fd->header))
385  return -1;
386 
387  ref_end = INT_MIN;
388  for (i = 0; i < s->hdr->num_records; i++) {
389  if (s->crecs[i].ref_id == ref) {
390  if (ref_end < s->crecs[i].aend)
391  ref_end = s->crecs[i].aend;
392  continue;
393  }
394 
395  if (ref != -2) {
396  sprintf(buf, "%d\t%d\t%d\t%"PRId64"\t%d\t%d\n",
397  ref, ref_start, ref_end - ref_start + 1,
398  (int64_t)cpos, landmark, sz);
399  zfputs(buf, fp);
400  }
401 
402  ref = s->crecs[i].ref_id;
403  ref_start = s->crecs[i].apos;
404  ref_end = INT_MIN;
405  }
406 
407  if (ref != -2) {
408  sprintf(buf, "%d\t%d\t%d\t%"PRId64"\t%d\t%d\n",
409  ref, ref_start, ref_end - ref_start + 1,
410  (int64_t)cpos, landmark, sz);
411  zfputs(buf, fp);
412  }
413 
414  return 0;
415 }
416 
417 /*
418  * Builds an index file.
419  *
420  * fd is a newly opened cram file that we wish to index.
421  * fn_base is the filename of the associated CRAM file. Internally we
422  * add ".crai" to this to get the index filename.
423  *
424  * Returns 0 on success
425  * -1 on failure
426  */
427 int cram_index_build(cram_fd *fd, const char *fn_base) {
428  cram_container *c;
429  off_t cpos, spos, hpos;
430  zfp *fp;
431  char fn_idx[PATH_MAX];
432 
433  if (strlen(fn_base) > PATH_MAX-6)
434  return -1;
435 
436  sprintf(fn_idx, "%s.crai", fn_base);
437  if (!(fp = zfopen(fn_idx, "wz"))) {
438  perror(fn_idx);
439  return -1;
440  }
441 
442  cpos = htell(fd->fp);
443  while ((c = cram_read_container(fd))) {
444  int j;
445 
446  if (fd->err) {
447  perror("Cram container read");
448  return 1;
449  }
450 
451  hpos = htell(fd->fp);
452 
453  if (!(c->comp_hdr_block = cram_read_block(fd)))
454  return 1;
456 
458  if (!c->comp_hdr)
459  return -1;
460 
461  // 2.0 format
462  for (j = 0; j < c->num_landmarks; j++) {
463  char buf[1024];
464  cram_slice *s;
465  int sz;
466 
467  spos = htell(fd->fp);
468  assert(spos - cpos - c->offset == c->landmark[j]);
469 
470  if (!(s = cram_read_slice(fd))) {
471  zfclose(fp);
472  return -1;
473  }
474 
475  sz = (int)(htell(fd->fp) - spos);
476 
477  if (s->hdr->ref_seq_id == -2) {
478  cram_index_build_multiref(fd, c, s, fp,
479  cpos, c->landmark[j], sz);
480  } else {
481  sprintf(buf, "%d\t%d\t%d\t%"PRId64"\t%d\t%d\n",
482  s->hdr->ref_seq_id, s->hdr->ref_seq_start,
483  s->hdr->ref_seq_span, (int64_t)cpos,
484  c->landmark[j], sz);
485  zfputs(buf, fp);
486  }
487 
488  cram_free_slice(s);
489  }
490 
491  cpos = htell(fd->fp);
492  assert(cpos == hpos + c->length);
493 
495  }
496  if (fd->err) {
497  zfclose(fp);
498  return -1;
499  }
500 
501 
502  return zfclose(fp);
503 }