NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
cram_io.c
Go to the documentation of this file.
1 /*
2 Copyright (c) 2012-2014 Genome Research Ltd.
3 Author: James Bonfield <jkb@sanger.ac.uk>
4 
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
7 
8  1. Redistributions of source code must retain the above copyright notice,
9 this list of conditions and the following disclaimer.
10 
11  2. Redistributions in binary form must reproduce the above copyright notice,
12 this list of conditions and the following disclaimer in the documentation
13 and/or other materials provided with the distribution.
14 
15  3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16 Institute nor the names of its contributors may be used to endorse or promote
17 products derived from this software without specific prior written permission.
18 
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30 
31 /*
32  * CRAM I/O primitives.
33  *
34  * - ITF8 encoding and decoding.
35  * - Block based I/O
36  * - Zlib inflating and deflating (memory)
37  * - CRAM basic data structure reading and writing
38  * - File opening / closing
39  * - Reference sequence handling
40  */
41 
42 /*
43  * TODO: BLOCK_GROW, BLOCK_RESIZE, BLOCK_APPEND and itf8_put_blk all need
44  * a way to return errors for when malloc fails.
45  */
46 
47 #ifdef HAVE_CONFIG_H
48 #include "io_lib_config.h"
49 #endif
50 
51 #include <stdio.h>
52 #include <errno.h>
53 #include <assert.h>
54 #include <stdlib.h>
55 #include <string.h>
56 #include <zlib.h>
57 #ifdef HAVE_LIBBZ2
58 #include <bzlib.h>
59 #endif
60 #include <sys/types.h>
61 #include <sys/stat.h>
62 #include <math.h>
63 #include <ctype.h>
64 
65 #include "cram/cram.h"
66 #include "cram/os.h"
67 #include "cram/md5.h"
68 #include "cram/open_trace_file.h"
69 
70 //#define REF_DEBUG
71 
72 #ifdef REF_DEBUG
73 #include <sys/syscall.h>
74 #define gettid() (int)syscall(SYS_gettid)
75 
76 #define RP(...) fprintf (stderr, __VA_ARGS__)
77 #else
78 #define RP(...)
79 #endif
80 
81 #ifdef SAMTOOLS
82 #include "htslib/hfile.h"
83 #define paranoid_hclose(fp) (hclose(fp))
84 #else
85 #define hclose_abruptly(fp) (fclose(fp))
86 #define hflush(fp) (fflush(fp))
87 #define hgetc(fp) (getc(fp))
88 #define hputc(c, fp) (putc((c), (fp)))
89 #define hread(fp, buffer, nbytes) (fread((buffer), 1, (nbytes), (fp)))
90 #define hseek(fp, offset, whence) (fseeko((fp), (offset), (whence)))
91 #define hwrite(fp, buffer, nbytes) (fwrite((buffer), 1, (nbytes), (fp)))
92 #define paranoid_hclose(fp) (paranoid_fclose(fp))
93 #endif
94 
95 /* ----------------------------------------------------------------------
96  * ITF8 encoding and decoding.
97  *
98 * Also see the itf8_get and itf8_put macros in cram_io.h
99  */
100 
101 /*
102  * Reads an integer in ITF-8 encoding from 'cp' and stores it in
103  * *val.
104  *
105  * Returns the number of bytes read on success
106  * -1 on failure
107  */
108 int itf8_decode(cram_fd *fd, int32_t *val_p) {
109  static int nbytes[16] = {
110  0,0,0,0, 0,0,0,0, // 0000xxxx - 0111xxxx
111  1,1,1,1, // 1000xxxx - 1011xxxx
112  2,2, // 1100xxxx - 1101xxxx
113  3, // 1110xxxx
114  4, // 1111xxxx
115  };
116 
117  static int nbits[16] = {
118  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, // 0000xxxx - 0111xxxx
119  0x3f, 0x3f, 0x3f, 0x3f, // 1000xxxx - 1011xxxx
120  0x1f, 0x1f, // 1100xxxx - 1101xxxx
121  0x0f, // 1110xxxx
122  0x0f, // 1111xxxx
123  };
124 
125  int32_t val = hgetc(fd->fp);
126  if (val == -1)
127  return -1;
128 
129  int i = nbytes[val>>4];
130  val &= nbits[val>>4];
131 
132  switch(i) {
133  case 0:
134  *val_p = val;
135  return 1;
136 
137  case 1:
138  val = (val<<8) | (unsigned char)hgetc(fd->fp);
139  *val_p = val;
140  return 2;
141 
142  case 2:
143  val = (val<<8) | (unsigned char)hgetc(fd->fp);
144  val = (val<<8) | (unsigned char)hgetc(fd->fp);
145  *val_p = val;
146  return 3;
147 
148  case 3:
149  val = (val<<8) | (unsigned char)hgetc(fd->fp);
150  val = (val<<8) | (unsigned char)hgetc(fd->fp);
151  val = (val<<8) | (unsigned char)hgetc(fd->fp);
152  *val_p = val;
153  return 4;
154 
155  case 4: // really 3.5 more, why make it different?
156  val = (val<<8) | (unsigned char)hgetc(fd->fp);
157  val = (val<<8) | (unsigned char)hgetc(fd->fp);
158  val = (val<<8) | (unsigned char)hgetc(fd->fp);
159  val = (val<<4) | (((unsigned char)hgetc(fd->fp)) & 0x0f);
160  *val_p = val;
161  }
162 
163  return 5;
164 }
165 
166 /*
167  * Encodes and writes a single integer in ITF-8 format.
168  * Returns 0 on success
169  * -1 on failure
170  */
171 int itf8_encode(cram_fd *fd, int32_t val) {
172  char buf[5];
173  int len = itf8_put(buf, val);
174  return hwrite(fd->fp, buf, len) == len ? 0 : -1;
175 }
176 
177 #ifndef ITF8_MACROS
178 /*
179  * As above, but decoding from memory
180  */
181 int itf8_get(char *cp, int32_t *val_p) {
182  unsigned char *up = (unsigned char *)cp;
183 
184  if (up[0] < 0x80) {
185  *val_p = up[0];
186  return 1;
187  } else if (up[0] < 0xc0) {
188  *val_p = ((up[0] <<8) | up[1]) & 0x3fff;
189  return 2;
190  } else if (up[0] < 0xe0) {
191  *val_p = ((up[0]<<16) | (up[1]<< 8) | up[2]) & 0x1fffff;
192  return 3;
193  } else if (up[0] < 0xf0) {
194  *val_p = ((up[0]<<24) | (up[1]<<16) | (up[2]<<8) | up[3]) & 0x0fffffff;
195  return 4;
196  } else {
197  *val_p = ((up[0] & 0x0f)<<28) | (up[1]<<20) | (up[2]<<12) | (up[3]<<4) | (up[4] & 0x0f);
198  return 5;
199  }
200 }
201 
202 /*
203  * Stores a value to memory in ITF-8 format.
204  *
205  * Returns the number of bytes required to store the number.
206  * This is a maximum of 5 bytes.
207  */
208 int itf8_put(char *cp, int32_t val) {
209  if (!(val & ~0x00000007f)) { // 1 byte
210  *cp = val;
211  return 1;
212  } else if (!(val & ~0x00003fff)) { // 2 byte
213  *cp++ = (val >> 8 ) | 0x80;
214  *cp = val & 0xff;
215  return 2;
216  } else if (!(val & ~0x01fffff)) { // 3 byte
217  *cp++ = (val >> 16) | 0xc0;
218  *cp++ = (val >> 8 ) & 0xff;
219  *cp = val & 0xff;
220  return 3;
221  } else if (!(val & ~0x0fffffff)) { // 4 byte
222  *cp++ = (val >> 24) | 0xe0;
223  *cp++ = (val >> 16) & 0xff;
224  *cp++ = (val >> 8 ) & 0xff;
225  *cp = val & 0xff;
226  return 4;
227  } else { // 5 byte
228  *cp++ = 0xf0 | ((val>>28) & 0xff);
229  *cp++ = (val >> 20) & 0xff;
230  *cp++ = (val >> 12) & 0xff;
231  *cp++ = (val >> 4 ) & 0xff;
232  *cp = val & 0x0f;
233  return 5;
234  }
235 }
236 #endif
237 
238 /* 64-bit itf8 variant */
239 int ltf8_put(char *cp, int64_t val) {
240  if (!(val & ~((1LL<<7)-1))) {
241  *cp = val;
242  return 1;
243  } else if (!(val & ~((1LL<<(6+8))-1))) {
244  *cp++ = (val >> 8 ) | 0x80;
245  *cp = val & 0xff;
246  return 2;
247  } else if (!(val & ~((1LL<<(5+2*8))-1))) {
248  *cp++ = (val >> 16) | 0xc0;
249  *cp++ = (val >> 8 ) & 0xff;
250  *cp = val & 0xff;
251  return 3;
252  } else if (!(val & ~((1LL<<(4+3*8))-1))) {
253  *cp++ = (val >> 24) | 0xe0;
254  *cp++ = (val >> 16) & 0xff;
255  *cp++ = (val >> 8 ) & 0xff;
256  *cp = val & 0xff;
257  return 4;
258  } else if (!(val & ~((1LL<<(3+4*8))-1))) {
259  *cp++ = (val >> 32) | 0xf0;
260  *cp++ = (val >> 24) & 0xff;
261  *cp++ = (val >> 16) & 0xff;
262  *cp++ = (val >> 8 ) & 0xff;
263  *cp = val & 0xff;
264  return 5;
265  } else if (!(val & ~((1LL<<(2+5*8))-1))) {
266  *cp++ = (val >> 40) | 0xf8;
267  *cp++ = (val >> 32) & 0xff;
268  *cp++ = (val >> 24) & 0xff;
269  *cp++ = (val >> 16) & 0xff;
270  *cp++ = (val >> 8 ) & 0xff;
271  *cp = val & 0xff;
272  return 6;
273  } else if (!(val & ~((1LL<<(1+6*8))-1))) {
274  *cp++ = (val >> 48) | 0xfc;
275  *cp++ = (val >> 40) & 0xff;
276  *cp++ = (val >> 32) & 0xff;
277  *cp++ = (val >> 24) & 0xff;
278  *cp++ = (val >> 16) & 0xff;
279  *cp++ = (val >> 8 ) & 0xff;
280  *cp = val & 0xff;
281  return 7;
282  } else if (!(val & ~((1LL<<(7*8))-1))) {
283  *cp++ = (val >> 56) | 0xfe;
284  *cp++ = (val >> 48) & 0xff;
285  *cp++ = (val >> 40) & 0xff;
286  *cp++ = (val >> 32) & 0xff;
287  *cp++ = (val >> 24) & 0xff;
288  *cp++ = (val >> 16) & 0xff;
289  *cp++ = (val >> 8 ) & 0xff;
290  *cp = val & 0xff;
291  return 8;
292  } else {
293  *cp++ = 0xff;
294  *cp++ = (val >> 56) & 0xff;
295  *cp++ = (val >> 48) & 0xff;
296  *cp++ = (val >> 40) & 0xff;
297  *cp++ = (val >> 32) & 0xff;
298  *cp++ = (val >> 24) & 0xff;
299  *cp++ = (val >> 16) & 0xff;
300  *cp++ = (val >> 8 ) & 0xff;
301  *cp = val & 0xff;
302  return 9;
303  }
304 }
305 
306 int ltf8_get(char *cp, int64_t *val_p) {
307  unsigned char *up = (unsigned char *)cp;
308 
309  if (up[0] < 0x80) {
310  *val_p = up[0];
311  return 1;
312  } else if (up[0] < 0xc0) {
313  *val_p = (((uint64_t)up[0]<< 8) |
314  (uint64_t)up[1]) & (((1LL<<(6+8)))-1);
315  return 2;
316  } else if (up[0] < 0xe0) {
317  *val_p = (((uint64_t)up[0]<<16) |
318  ((uint64_t)up[1]<< 8) |
319  (uint64_t)up[2]) & ((1LL<<(5+2*8))-1);
320  return 3;
321  } else if (up[0] < 0xf0) {
322  *val_p = (((uint64_t)up[0]<<24) |
323  ((uint64_t)up[1]<<16) |
324  ((uint64_t)up[2]<< 8) |
325  (uint64_t)up[3]) & ((1LL<<(4+3*8))-1);
326  return 4;
327  } else if (up[0] < 0xf8) {
328  *val_p = (((uint64_t)up[0]<<32) |
329  ((uint64_t)up[1]<<24) |
330  ((uint64_t)up[2]<<16) |
331  ((uint64_t)up[3]<< 8) |
332  (uint64_t)up[4]) & ((1LL<<(3+4*8))-1);
333  return 5;
334  } else if (up[0] < 0xfc) {
335  *val_p = (((uint64_t)up[0]<<40) |
336  ((uint64_t)up[1]<<32) |
337  ((uint64_t)up[2]<<24) |
338  ((uint64_t)up[3]<<16) |
339  ((uint64_t)up[4]<< 8) |
340  (uint64_t)up[5]) & ((1LL<<(2+5*8))-1);
341  return 6;
342  } else if (up[0] < 0xfe) {
343  *val_p = (((uint64_t)up[0]<<48) |
344  ((uint64_t)up[1]<<40) |
345  ((uint64_t)up[2]<<32) |
346  ((uint64_t)up[3]<<24) |
347  ((uint64_t)up[4]<<16) |
348  ((uint64_t)up[5]<< 8) |
349  (uint64_t)up[6]) & ((1LL<<(1+6*8))-1);
350  return 7;
351  } else if (up[0] < 0xff) {
352  *val_p = (((uint64_t)up[1]<<48) |
353  ((uint64_t)up[2]<<40) |
354  ((uint64_t)up[3]<<32) |
355  ((uint64_t)up[4]<<24) |
356  ((uint64_t)up[5]<<16) |
357  ((uint64_t)up[6]<< 8) |
358  (uint64_t)up[7]) & ((1LL<<(7*8))-1);
359  return 8;
360  } else {
361  *val_p = (((uint64_t)up[1]<<56) |
362  ((uint64_t)up[2]<<48) |
363  ((uint64_t)up[3]<<40) |
364  ((uint64_t)up[4]<<32) |
365  ((uint64_t)up[5]<<24) |
366  ((uint64_t)up[6]<<16) |
367  ((uint64_t)up[7]<< 8) |
368  (uint64_t)up[8]);
369  return 9;
370  }
371 }
372 
373 int ltf8_decode(cram_fd *fd, int64_t *val_p) {
374  int c = hgetc(fd->fp);
375  int64_t val = (unsigned char)c;
376  if (c == -1)
377  return -1;
378 
379  if (val < 0x80) {
380  *val_p = val;
381  return 1;
382 
383  } else if (val < 0xc0) {
384  val = (val<<8) | (unsigned char)hgetc(fd->fp);
385  *val_p = val & (((1LL<<(6+8)))-1);
386  return 2;
387 
388  } else if (val < 0xe0) {
389  val = (val<<8) | (unsigned char)hgetc(fd->fp);
390  val = (val<<8) | (unsigned char)hgetc(fd->fp);
391  *val_p = val & ((1LL<<(5+2*8))-1);
392  return 3;
393 
394  } else if (val < 0xf0) {
395  val = (val<<8) | (unsigned char)hgetc(fd->fp);
396  val = (val<<8) | (unsigned char)hgetc(fd->fp);
397  val = (val<<8) | (unsigned char)hgetc(fd->fp);
398  *val_p = val & ((1LL<<(4+3*8))-1);
399  return 4;
400 
401  } else if (val < 0xf8) {
402  val = (val<<8) | (unsigned char)hgetc(fd->fp);
403  val = (val<<8) | (unsigned char)hgetc(fd->fp);
404  val = (val<<8) | (unsigned char)hgetc(fd->fp);
405  val = (val<<8) | (unsigned char)hgetc(fd->fp);
406  *val_p = val & ((1LL<<(3+4*8))-1);
407  return 5;
408 
409  } else if (val < 0xfc) {
410  val = (val<<8) | (unsigned char)hgetc(fd->fp);
411  val = (val<<8) | (unsigned char)hgetc(fd->fp);
412  val = (val<<8) | (unsigned char)hgetc(fd->fp);
413  val = (val<<8) | (unsigned char)hgetc(fd->fp);
414  val = (val<<8) | (unsigned char)hgetc(fd->fp);
415  *val_p = val & ((1LL<<(2+5*8))-1);
416  return 6;
417 
418  } else if (val < 0xfe) {
419  val = (val<<8) | (unsigned char)hgetc(fd->fp);
420  val = (val<<8) | (unsigned char)hgetc(fd->fp);
421  val = (val<<8) | (unsigned char)hgetc(fd->fp);
422  val = (val<<8) | (unsigned char)hgetc(fd->fp);
423  val = (val<<8) | (unsigned char)hgetc(fd->fp);
424  val = (val<<8) | (unsigned char)hgetc(fd->fp);
425  *val_p = val & ((1LL<<(1+6*8))-1);
426  return 7;
427 
428  } else if (val < 0xff) {
429  val = (val<<8) | (unsigned char)hgetc(fd->fp);
430  val = (val<<8) | (unsigned char)hgetc(fd->fp);
431  val = (val<<8) | (unsigned char)hgetc(fd->fp);
432  val = (val<<8) | (unsigned char)hgetc(fd->fp);
433  val = (val<<8) | (unsigned char)hgetc(fd->fp);
434  val = (val<<8) | (unsigned char)hgetc(fd->fp);
435  val = (val<<8) | (unsigned char)hgetc(fd->fp);
436  *val_p = val & ((1LL<<(7*8))-1);
437  return 8;
438 
439  } else {
440  val = (val<<8) | (unsigned char)hgetc(fd->fp);
441  val = (val<<8) | (unsigned char)hgetc(fd->fp);
442  val = (val<<8) | (unsigned char)hgetc(fd->fp);
443  val = (val<<8) | (unsigned char)hgetc(fd->fp);
444  val = (val<<8) | (unsigned char)hgetc(fd->fp);
445  val = (val<<8) | (unsigned char)hgetc(fd->fp);
446  val = (val<<8) | (unsigned char)hgetc(fd->fp);
447  val = (val<<8) | (unsigned char)hgetc(fd->fp);
448  *val_p = val;
449  }
450 
451  return 9;
452 }
453 
454 /*
455  * Pushes a value in ITF8 format onto the end of a block.
456  * This shouldn't be used for high-volume data as it is not the fastest
457  * method.
458  *
459  * Returns the number of bytes written
460  */
461 int itf8_put_blk(cram_block *blk, int val) {
462  char buf[5];
463  int sz;
464 
465  sz = itf8_put(buf, val);
466  BLOCK_APPEND(blk, buf, sz);
467  return sz;
468 }
469 
470 /*
471  * Decodes a 32-bit little endian value from fd and stores in val.
472  *
473  * Returns the number of bytes read on success
474  * -1 on failure
475  */
476 int int32_decode(cram_fd *fd, int32_t *val) {
477  int32_t i;
478  if (4 != hread(fd->fp, &i, 4))
479  return -1;
480 
481  *val = le_int4(i);
482  return 4;
483 }
484 
485 /*
486  * Encodes a 32-bit little endian value 'val' and writes to fd.
487  *
488  * Returns the number of bytes written on success
489  * -1 on failure
490  */
491 int int32_encode(cram_fd *fd, int32_t val) {
492  val = le_int4(val);
493  if (4 != hwrite(fd->fp, &val, 4))
494  return -1;
495 
496  return 4;
497 }
498 
499 /* As int32_decoded/encode, but from/to blocks instead of cram_fd */
500 int int32_get(cram_block *b, int32_t *val) {
501  if (b->uncomp_size - BLOCK_SIZE(b) < 4)
502  return -1;
503 
504  *val =
505  b->data[b->byte ] |
506  (b->data[b->byte+1] << 8) |
507  (b->data[b->byte+2] << 16) |
508  (b->data[b->byte+3] << 24);
509  BLOCK_SIZE(b) += 4;
510  return 4;
511 }
512 
513 /* As int32_decoded/encode, but from/to blocks instead of cram_fd */
515  unsigned char cp[4];
516  cp[0] = ( val & 0xff);
517  cp[1] = ((val>>8) & 0xff);
518  cp[2] = ((val>>16) & 0xff);
519  cp[3] = ((val>>24) & 0xff);
520 
521  BLOCK_APPEND(b, cp, 4);
522  return b->data ? 0 : -1;
523 }
524 
525 /* ----------------------------------------------------------------------
526  * zlib compression code - from Gap5's tg_iface_g.c
527  * They're static here as they're only used within the cram_compress_block
528  * and cram_uncompress_block functions, which are the external interface.
529  */
530 char *zlib_mem_inflate(char *cdata, size_t csize, size_t *size) {
531  z_stream s;
532  unsigned char *data = NULL; /* Uncompressed output */
533  int data_alloc = 0;
534  int err;
535 
536  /* Starting point at uncompressed size, and scale after that */
537  data = malloc(data_alloc = csize*1.2+100);
538  if (!data)
539  return NULL;
540 
541  /* Initialise zlib stream */
542  s.zalloc = Z_NULL; /* use default allocation functions */
543  s.zfree = Z_NULL;
544  s.opaque = Z_NULL;
545  s.next_in = (unsigned char *)cdata;
546  s.avail_in = csize;
547  s.total_in = 0;
548  s.next_out = data;
549  s.avail_out = data_alloc;
550  s.total_out = 0;
551 
552  //err = inflateInit(&s);
553  err = inflateInit2(&s, 15 + 32);
554  if (err != Z_OK) {
555  fprintf(stderr, "zlib inflateInit error: %s\n", s.msg);
556  free(data);
557  return NULL;
558  }
559 
560  /* Decode to 'data' array */
561  for (;s.avail_in;) {
562  unsigned char *data_tmp;
563  int alloc_inc;
564 
565  s.next_out = &data[s.total_out];
566  err = inflate(&s, Z_NO_FLUSH);
567  if (err == Z_STREAM_END)
568  break;
569 
570  if (err != Z_OK) {
571  fprintf(stderr, "zlib inflate error: %s\n", s.msg);
572  break;
573  }
574 
575  /* More to come, so realloc based on growth so far */
576  alloc_inc = (double)s.avail_in/s.total_in * s.total_out + 100;
577  data = realloc((data_tmp = data), data_alloc += alloc_inc);
578  if (!data) {
579  free(data_tmp);
580  return NULL;
581  }
582  s.avail_out += alloc_inc;
583  }
584  inflateEnd(&s);
585 
586  *size = s.total_out;
587  return (char *)data;
588 }
589 
590 static char *zlib_mem_deflate(char *data, size_t size, size_t *cdata_size,
591  int level, int strat) {
592  z_stream s;
593  unsigned char *cdata = NULL; /* Compressed output */
594  int cdata_alloc = 0;
595  int cdata_pos = 0;
596  int err;
597 
598  cdata = malloc(cdata_alloc = size*1.05+100);
599  if (!cdata)
600  return NULL;
601  cdata_pos = 0;
602 
603  /* Initialise zlib stream */
604  s.zalloc = Z_NULL; /* use default allocation functions */
605  s.zfree = Z_NULL;
606  s.opaque = Z_NULL;
607  s.next_in = (unsigned char *)data;
608  s.avail_in = size;
609  s.total_in = 0;
610  s.next_out = cdata;
611  s.avail_out = cdata_alloc;
612  s.total_out = 0;
613  s.data_type = Z_BINARY;
614 
615  err = deflateInit2(&s, level, Z_DEFLATED, 15|16, 9, strat);
616  if (err != Z_OK) {
617  fprintf(stderr, "zlib deflateInit2 error: %s\n", s.msg);
618  return NULL;
619  }
620 
621  /* Encode to 'cdata' array */
622  for (;s.avail_in;) {
623  s.next_out = &cdata[cdata_pos];
624  s.avail_out = cdata_alloc - cdata_pos;
625  if (cdata_alloc - cdata_pos <= 0) {
626  fprintf(stderr, "Deflate produced larger output than expected. Abort\n");
627  return NULL;
628  }
629  err = deflate(&s, Z_NO_FLUSH);
630  cdata_pos = cdata_alloc - s.avail_out;
631  if (err != Z_OK) {
632  fprintf(stderr, "zlib deflate error: %s\n", s.msg);
633  break;
634  }
635  }
636  if (deflate(&s, Z_FINISH) != Z_STREAM_END) {
637  fprintf(stderr, "zlib deflate error: %s\n", s.msg);
638  }
639  *cdata_size = s.total_out;
640 
641  if (deflateEnd(&s) != Z_OK) {
642  fprintf(stderr, "zlib deflate error: %s\n", s.msg);
643  }
644  return (char *)cdata;
645 }
646 
647 /* ----------------------------------------------------------------------
648  * CRAM blocks - the dynamically growable data block. We have code to
649  * create, update, (un)compress and read/write.
650  *
651  * These are derived from the deflate_interlaced.c blocks, but with the
652  * CRAM extension of content types and IDs.
653  */
654 
655 /*
656  * Allocates a new cram_block structure with a specified content_type and
657  * id.
658  *
659  * Returns block pointer on success
660  * NULL on failure
661  */
663  int content_id) {
664  cram_block *b = malloc(sizeof(*b));
665  if (!b)
666  return NULL;
667  b->method = b->orig_method = RAW;
668  b->content_type = content_type;
669  b->content_id = content_id;
670  b->comp_size = 0;
671  b->uncomp_size = 0;
672  b->data = NULL;
673  b->alloc = 0;
674  b->byte = 0;
675  b->bit = 7; // MSB
676 
677  return b;
678 }
679 
680 /*
681  * Reads a block from a cram file.
682  * Returns cram_block pointer on success.
683  * NULL on failure
684  */
686  cram_block *b = malloc(sizeof(*b));
687  if (!b)
688  return NULL;
689 
690  //fprintf(stderr, "Block at %d\n", (int)ftell(fd->fp));
691 
692  if (-1 == (b->method = hgetc(fd->fp))) { free(b); return NULL; }
693  if (-1 == (b->content_type= hgetc(fd->fp))) { free(b); return NULL; }
694  if (-1 == itf8_decode(fd, &b->content_id)) { free(b); return NULL; }
695  if (-1 == itf8_decode(fd, &b->comp_size)) { free(b); return NULL; }
696  if (-1 == itf8_decode(fd, &b->uncomp_size)) { free(b); return NULL; }
697 
698  // fprintf(stderr, " method %d, ctype %d, cid %d, csize %d, ucsize %d\n",
699  // b->method, b->content_type, b->content_id, b->comp_size, b->uncomp_size);
700 
701  if (b->method == RAW) {
702  b->alloc = b->uncomp_size;
703  if (!(b->data = malloc(b->uncomp_size))){ free(b); return NULL; }
704  if (b->uncomp_size != hread(fd->fp, b->data, b->uncomp_size)) {
705  free(b->data);
706  free(b);
707  return NULL;
708  }
709  } else {
710  b->alloc = b->comp_size;
711  if (!(b->data = malloc(b->comp_size))) { free(b); return NULL; }
712  if (b->comp_size != hread(fd->fp, b->data, b->comp_size)) {
713  free(b->data);
714  free(b);
715  return NULL;
716  }
717  }
718 
719  b->orig_method = b->method;
720  b->idx = 0;
721  b->byte = 0;
722  b->bit = 7; // MSB
723 
724  return b;
725 }
726 
727 /*
728  * Writes a CRAM block.
729  * Returns 0 on success
730  * -1 on failure
731  */
733  assert(b->method != RAW || (b->comp_size == b->uncomp_size));
734 
735  if (hputc(b->method, fd->fp) == EOF) return -1;
736  if (hputc(b->content_type, fd->fp) == EOF) return -1;
737  if (itf8_encode(fd, b->content_id) == -1) return -1;
738  if (itf8_encode(fd, b->comp_size) == -1) return -1;
739  if (itf8_encode(fd, b->uncomp_size) == -1) return -1;
740 
741  if (b->method == RAW) {
742  if (b->uncomp_size != hwrite(fd->fp, b->data, b->uncomp_size))
743  return -1;
744  } else {
745  if (b->comp_size != hwrite(fd->fp, b->data, b->comp_size))
746  return -1;
747  }
748 
749  return 0;
750 }
751 
752 /*
753  * Frees a CRAM block, deallocating internal data too.
754  */
756  if (!b)
757  return;
758  if (b->data)
759  free(b->data);
760  free(b);
761 }
762 
763 /*
764  * Uncompresses a CRAM block, if compressed.
765  */
767  char *uncomp;
768  size_t uncomp_size = 0;
769 
770  if (b->uncomp_size == 0) {
771  // blank block
772  b->method = RAW;
773  return 0;
774  }
775 
776  switch (b->method) {
777  case RAW:
778  b->uncomp_size = b->comp_size;
779  return 0;
780 
781  case GZIP:
782  uncomp = zlib_mem_inflate((char *)b->data, b->comp_size, &uncomp_size);
783  if (!uncomp)
784  return -1;
785  if ((int)uncomp_size != b->uncomp_size)
786  return -1;
787  free(b->data);
788  b->data = (unsigned char *)uncomp;
789  b->alloc = uncomp_size;
790  b->method = RAW;
791  break;
792 
793 #ifdef HAVE_LIBBZ2
794  case BZIP2: {
795  unsigned int usize = b->uncomp_size;
796  if (!(uncomp = malloc(usize)))
797  return -1;
798  if (BZ_OK != BZ2_bzBuffToBuffDecompress(uncomp, &usize,
799  (char *)b->data, b->comp_size,
800  0, 0)) {
801  free(uncomp);
802  return -1;
803  }
804  b->data = (unsigned char *)uncomp;
805  b->alloc = usize;
806  b->method = RAW;
807  b->uncomp_size = usize; // Just incase it differs
808  break;
809  }
810 #else
811  case BZIP2:
812  fprintf(stderr, "Bzip2 compression is not compiled into this "
813  "version.\nPlease rebuild and try again.\n");
814  return -1;
815 #endif
816 
817  case BM_ERROR:
818  default:
819  return -1;
820  }
821 
822  return 0;
823 }
824 
825 #ifdef HAVE_LIBBZ2
826 static int cram_compress_block_bzip2(cram_fd *fd, cram_block *b,
827  cram_metrics *metrics, int level) {
828  unsigned int comp_size = b->uncomp_size*1.01 + 600;
829  char *comp = malloc(comp_size);
830  char *data = (char *)b->data;
831 
832  if (!comp)
833  return -1;
834 
835  if (!data)
836  data = "";
837 
838  if (BZ_OK != BZ2_bzBuffToBuffCompress(comp, &comp_size,
839  data, b->uncomp_size,
840  level, 0, 30)) {
841  free(comp);
842  return -1;
843  }
844 
845  free(b->data);
846  b->data = (unsigned char *)comp;
847  b->method = BZIP2;
848  b->comp_size = comp_size;
849 
850  if (fd->verbose)
851  fprintf(stderr, "Compressed block ID %d from %d to %d\n",
852  b->content_id, b->uncomp_size, b->comp_size);
853 
854  return 0;
855 }
856 #endif
857 
858 /*
859  * Compresses a block using one of two different zlib strategies. If we only
860  * want one choice set strat2 to be -1.
861  *
862  * The logic here is that sometimes Z_RLE does a better job than Z_FILTERED
863  * or Z_DEFAULT_STRATEGY on quality data. If so, we'd rather use it as it is
864  * significantly faster.
865  */
867  int level, int strat,
868  int level2, int strat2) {
869  char *comp = NULL;
870  size_t comp_size = 0;
871 
872  if (level == 0) {
873  b->method = RAW;
874  b->comp_size = b->uncomp_size;
875  return 0;
876  }
877 
878  if (b->method != RAW) {
879  fprintf(stderr, "Attempt to compress an already compressed block.\n");
880  return 0;
881  }
882 
883 #ifdef HAVE_LIBBZ2
884  if (fd->use_bz2)
885  // metrics ignored for bzip2
886  return cram_compress_block_bzip2(fd, b, metrics, level);
887 #endif
888 
889  pthread_mutex_lock(&fd->metrics_lock);
890  if (strat2 >= 0)
891  if (fd->verbose > 1)
892  fprintf(stderr, "metrics trial %d, next_trial %d, m1 %d, m2 %d\n",
893  metrics->trial, metrics->next_trial,
894  metrics->m1, metrics->m2);
895 
896  if (strat2 >= 0 && (metrics->trial > 0 || --metrics->next_trial <= 0)) {
897  char *c1, *c2;
898  size_t s1, s2;
899 
900  if (metrics->next_trial == 0) {
901  metrics->next_trial = 100;
902  metrics->trial = 3;
903  metrics->m1 = metrics->m2 = 0;
904  }
905  pthread_mutex_unlock(&fd->metrics_lock);
906 
907  c1 = zlib_mem_deflate((char *)b->data, b->uncomp_size,
908  &s1, level, strat);
909  c2 = zlib_mem_deflate((char *)b->data, b->uncomp_size,
910  &s2, level2, strat2);
911  if (!c1 || !c2)
912  return -1;
913 
914  //fprintf(stderr, "1: %6d 2: %6d %5.1f\n", s1, s2, 100.0*s1/s2);
915 
916  pthread_mutex_lock(&fd->metrics_lock);
917  if (s1 < 0.98 * s2) { // 2nd one should be faster alternative
918  if (fd->verbose > 1)
919  fprintf(stderr, "M1 wins %d vs %d\n", (int)s1, (int)s2);
920  comp = c1; comp_size = s1;
921  free(c2);
922  metrics->m1++;
923  } else {
924  if (fd->verbose > 1)
925  fprintf(stderr, "M2 wins %d vs %d\n", (int)s1, (int)s2);
926  comp = c2; comp_size = s2;
927  free(c1);
928  metrics->m2++;
929  }
930  metrics->trial--;
931  pthread_mutex_unlock(&fd->metrics_lock);
932  } else if (strat2 >= 0) {
933  int xlevel = metrics->m1 > metrics->m2 ? level : level2;
934  int xstrat = metrics->m1 > metrics->m2 ? strat : strat2;
935  pthread_mutex_unlock(&fd->metrics_lock);
936  comp = zlib_mem_deflate((char *)b->data, b->uncomp_size, &comp_size,
937  xlevel, xstrat);
938  } else {
939  pthread_mutex_unlock(&fd->metrics_lock);
940  comp = zlib_mem_deflate((char *)b->data, b->uncomp_size, &comp_size,
941  level, strat);
942  }
943 
944  if (!comp)
945  return -1;
946 
947  free(b->data);
948  b->data = (unsigned char *)comp;
949  b->method = GZIP;
950  b->comp_size = comp_size;
951 
952  if (fd->verbose)
953  fprintf(stderr, "Compressed block ID %d from %d to %d\n",
954  b->content_id, b->uncomp_size, b->comp_size);
955 
956  return 0;
957 }
958 
960  cram_metrics *m = malloc(sizeof(*m));
961  if (!m)
962  return NULL;
963  m->m1 = m->m2 = 0;
964  m->trial = 2;
965  m->next_trial = 100;
966  return m;
967 }
968 
970  switch(m) {
971  case RAW: return "RAW";
972  case GZIP: return "GZIP";
973  case BZIP2: return "BZIP2";
974  case BM_ERROR: break;
975  }
976  return "?";
977 }
978 
980  switch (t) {
981  case FILE_HEADER: return "FILE_HEADER";
982  case COMPRESSION_HEADER: return "COMPRESSION_HEADER";
983  case MAPPED_SLICE: return "MAPPED_SLICE";
984  case UNMAPPED_SLICE: return "UNMAPPED_SLICE";
985  case EXTERNAL: return "EXTERNAL";
986  case CORE: return "CORE";
987  case CT_ERROR: break;
988  }
989  return "?";
990 }
991 
992 /*
993  * Extra error checking on fclose to really ensure data is written.
994  * Care needs to be taken to handle pipes vs real files.
995  *
996  * Returns 0 on success
997  * -1 on failure.
998  */
999 int paranoid_fclose(FILE *fp) {
1000  if (-1 == fflush(fp) && errno != EBADF) {
1001  fclose(fp);
1002  return -1;
1003  }
1004 
1005  errno = 0;
1006  if (-1 == fsync(fileno(fp))) {
1007  if (errno != EINVAL) { // eg pipe
1008  fclose(fp);
1009  return -1;
1010  }
1011  }
1012  return fclose(fp);
1013 }
1014 
1015 /* ----------------------------------------------------------------------
1016  * Reference sequence handling
1017  *
1018  * These revolve around the refs_t structure, which may potentially be
1019  * shared between multiple cram_fd.
1020  *
1021  * We start with refs_create() to allocate an empty refs_t and then
1022  * populate it with @SQ line data using refs_from_header(). This is done on
1023  * cram_open(). Also at start up we can call cram_load_reference() which
1024  * is used with "scramble -r foo.fa". This replaces the fd->refs with the
1025  * new one specified. In either case refs2id() is then called which
1026  * maps ref_entry names to @SQ ids (refs_t->ref_id[]).
1027  *
1028  * Later, possibly within a thread, we will want to know the actual ref
1029  * seq itself, obtained by calling cram_get_ref(). This may use the
1030  * UR: or M5: fields or the filename specified in the original
1031  * cram_load_reference() call.
1032  *
1033  * Given the potential for multi-threaded reference usage, we have
1034  * reference counting (sorry for the confusing double use of "ref") to
1035  * track the number of callers interested in any specific reference.
1036  */
1037 
1038 void refs_free(refs_t *r) {
1039  RP("refs_free()\n");
1040 
1041  if (--r->count > 0)
1042  return;
1043 
1044  if (!r)
1045  return;
1046 
1047  if (r->pool)
1049 
1050  if (r->h_meta) {
1051  khint_t k;
1052 
1053  for (k = kh_begin(r->h_meta); k != kh_end(r->h_meta); k++) {
1054  ref_entry *e;
1055 
1056  if (!kh_exist(r->h_meta, k))
1057  continue;
1058  if (!(e = kh_val(r->h_meta, k)))
1059  continue;
1060  if (e->seq)
1061  free(e->seq);
1062  free(e);
1063  }
1064 
1065  kh_destroy(refs, r->h_meta);
1066  }
1067 
1068  if (r->ref_id)
1069  free(r->ref_id);
1070 
1071  if (r->fp)
1072  fclose(r->fp);
1073 
1074  pthread_mutex_destroy(&r->lock);
1075 
1076  free(r);
1077 }
1078 
1079 static refs_t *refs_create(void) {
1080  refs_t *r = calloc(1, sizeof(*r));
1081 
1082  RP("refs_create()\n");
1083 
1084  if (!r)
1085  return NULL;
1086 
1087  if (!(r->pool = string_pool_create(8192)))
1088  goto err;
1089 
1090  r->ref_id = NULL; // see refs2id() to populate.
1091  r->count = 1;
1092  r->last = NULL;
1093  r->last_id = -1;
1094 
1095  if (!(r->h_meta = kh_init(refs)))
1096  goto err;
1097 
1098  pthread_mutex_init(&r->lock, NULL);
1099 
1100  return r;
1101 
1102  err:
1103  refs_free(r);
1104  return NULL;
1105 }
1106 
1107 /*
1108  * Loads a FAI file for a reference.fasta.
1109  * "is_err" indicates whether failure to load is worthy of emitting an
1110  * error message. In some cases (eg with embedded references) we
1111  * speculatively load, just incase, and silently ignore errors.
1112  *
1113  * Returns the refs_t struct on success (maybe newly allocated);
1114  * NULL on failure
1115  */
1116 static refs_t *refs_load_fai(refs_t *r_orig, char *fn, int is_err) {
1117  struct stat sb;
1118  FILE *fp = NULL;
1119  char fai_fn[PATH_MAX];
1120  char line[8192];
1121  refs_t *r = r_orig;
1122  size_t fn_l = strlen(fn);
1123 
1124  RP("refs_load_fai %s\n", fn);
1125 
1126  if (!r)
1127  if (!(r = refs_create()))
1128  goto err;
1129 
1130  /* Open reference, for later use */
1131  if (stat(fn, &sb) != 0) {
1132  if (is_err)
1133  perror(fn);
1134  goto err;
1135  }
1136 
1137  if (r->fp)
1138  fclose(r->fp);
1139  r->fp = NULL;
1140 
1141  if (!(r->fn = string_dup(r->pool, fn)))
1142  goto err;
1143 
1144  if (fn_l > 4 && strcmp(&fn[fn_l-4], ".fai") == 0)
1145  r->fn[fn_l-4] = 0;
1146 
1147  if (!(r->fp = fopen(r->fn, "r"))) {
1148  if (is_err)
1149  perror(fn);
1150  goto err;
1151  }
1152 
1153  /* Parse .fai file and load meta-data */
1154  sprintf(fai_fn, "%.*s.fai", PATH_MAX-5, r->fn);
1155 
1156  if (stat(fai_fn, &sb) != 0) {
1157  if (is_err)
1158  perror(fai_fn);
1159  goto err;
1160  }
1161  if (!(fp = fopen(fai_fn, "r"))) {
1162  if (is_err)
1163  perror(fai_fn);
1164  goto err;
1165  }
1166  while (fgets(line, 8192, fp) != NULL) {
1167  ref_entry *e = malloc(sizeof(*e));
1168  char *cp;
1169  int n;
1170  khint_t k;
1171 
1172  if (!e)
1173  return NULL;
1174 
1175  // id
1176  for (cp = line; *cp && !isspace(*cp); cp++)
1177  ;
1178  *cp++ = 0;
1179  e->name = string_dup(r->pool, line);
1180 
1181  // length
1182  while (*cp && isspace(*cp))
1183  cp++;
1184  e->length = strtoll(cp, &cp, 10);
1185 
1186  // offset
1187  while (*cp && isspace(*cp))
1188  cp++;
1189  e->offset = strtoll(cp, &cp, 10);
1190 
1191  // bases per line
1192  while (*cp && isspace(*cp))
1193  cp++;
1194  e->bases_per_line = strtol(cp, &cp, 10);
1195 
1196  // line length
1197  while (*cp && isspace(*cp))
1198  cp++;
1199  e->line_length = strtol(cp, &cp, 10);
1200 
1201  // filename
1202  e->fn = r->fn;
1203 
1204  e->count = 0;
1205  e->seq = NULL;
1206 
1207  k = kh_put(refs, r->h_meta, e->name, &n);
1208  if (-1 == n) {
1209  free(e);
1210  return NULL;
1211  }
1212 
1213  if (n) {
1214  kh_val(r->h_meta, k) = e;
1215  } else {
1216  ref_entry *re = kh_val(r->h_meta, k);
1217  if (re && (re->count != 0 || re->length != 0)) {
1218  /* Keep old */
1219  free(e);
1220  } else {
1221  /* Replace old */
1222  if (re)
1223  free(re);
1224  kh_val(r->h_meta, k) = e;
1225  }
1226  }
1227  }
1228 
1229  return r;
1230 
1231  err:
1232  if (fp)
1233  fclose(fp);
1234 
1235  if (!r_orig)
1236  refs_free(r);
1237 
1238  return NULL;
1239 }
1240 
1241 /*
1242  * Indexes references by the order they appear in a BAM file. This may not
1243  * necessarily be the same order they appear in the fasta reference file.
1244  *
1245  * Returns 0 on success
1246  * -1 on failure
1247  */
1248 int refs2id(refs_t *r, SAM_hdr *h) {
1249  int i;
1250 
1251  if (r->ref_id)
1252  free(r->ref_id);
1253  if (r->last)
1254  r->last = NULL;
1255 
1256  r->ref_id = calloc(h->nref, sizeof(*r->ref_id));
1257  if (!r->ref_id)
1258  return -1;
1259 
1260  r->nref = h->nref;
1261  for (i = 0; i < h->nref; i++) {
1262  khint_t k = kh_get(refs, r->h_meta, h->ref[i].name);
1263  if (k != kh_end(r->h_meta)) {
1264  r->ref_id[i] = kh_val(r->h_meta, k);
1265  } else {
1266  fprintf(stderr, "Unable to find ref name '%s'\n",
1267  h->ref[i].name);
1268  }
1269  }
1270 
1271  return 0;
1272 }
1273 
1274 /*
1275  * Generates refs_t entries based on @SQ lines in the header.
1276  * Returns 0 on success
1277  * -1 on failure
1278  */
1279 static int refs_from_header(refs_t *r, cram_fd *fd, SAM_hdr *h) {
1280  int i;
1281 
1282  if (!h || h->nref == 0)
1283  return 0;
1284 
1285  //fprintf(stderr, "refs_from_header for %p mode %c\n", fd, fd->mode);
1286 
1287  /* Existing refs are fine, as long as they're compatible with the hdr. */
1288  i = r->nref;
1289  if (r->nref < h->nref)
1290  r->nref = h->nref;
1291 
1292  if (!(r->ref_id = realloc(r->ref_id, r->nref * sizeof(*r->ref_id))))
1293  return -1;
1294 
1295  for (; i < r->nref; i++)
1296  r->ref_id[i] = NULL;
1297 
1298  /* Copy info from h->ref[i] over to r */
1299  for (i = 0; i < h->nref; i++) {
1300  SAM_hdr_type *ty;
1301  SAM_hdr_tag *tag;
1302  khint_t k;
1303  int n;
1304 
1305  if (r->ref_id[i] && 0 == strcmp(r->ref_id[i]->name, h->ref[i].name))
1306  continue;
1307 
1308  if (!(r->ref_id[i] = calloc(1, sizeof(ref_entry))))
1309  return -1;
1310 
1311  if (!h->ref[i].name)
1312  return -1;
1313 
1314  r->ref_id[i]->name = string_dup(r->pool, h->ref[i].name);
1315  r->ref_id[i]->length = 0; // marker for not yet loaded
1316 
1317  /* Initialise likely filename if known */
1318  if ((ty = sam_hdr_find(h, "SQ", "SN", h->ref[i].name))) {
1319  if ((tag = sam_hdr_find_key(h, ty, "M5", NULL))) {
1320  r->ref_id[i]->fn = string_dup(r->pool, tag->str+3);
1321  //fprintf(stderr, "Tagging @SQ %s / %s\n", r->ref_id[i]->name, r->ref_id[i]->fn);
1322  }
1323  }
1324 
1325  k = kh_put(refs, r->h_meta, r->ref_id[i]->name, &n);
1326  if (n <= 0) // already exists or error
1327  return -1;
1328  kh_val(r->h_meta, k) = r->ref_id[i];
1329  }
1330 
1331  return 0;
1332 }
1333 
1334 /*
1335  * Attaches a header to a cram_fd.
1336  *
1337  * This should be used when creating a new cram_fd for writing where
1338  * we have an SAM_hdr already constructed (eg from a file we've read
1339  * in).
1340  */
1342  fd->header = hdr;
1343  return refs_from_header(fd->refs, fd, hdr);
1344 }
1345 
1346 /*
1347  * Converts a directory and a filename into an expanded path, replacing %s
1348  * in directory with the filename and %[0-9]+s with portions of the filename
1349  * Any remaining parts of filename are added to the end with /%s.
1350  */
1351 void expand_cache_path(char *path, char *dir, char *fn) {
1352  char *cp;
1353 
1354  while ((cp = strchr(dir, '%'))) {
1355  strncpy(path, dir, cp-dir);
1356  path += cp-dir;
1357 
1358  if (*++cp == 's') {
1359  strcpy(path, fn);
1360  path += strlen(fn);
1361  fn += strlen(fn);
1362  cp++;
1363  } else if (*cp >= '0' && *cp <= '9') {
1364  char *endp;
1365  long l;
1366 
1367  l = strtol(cp, &endp, 10);
1368  l = MIN(l, strlen(fn));
1369  if (*endp == 's') {
1370  strncpy(path, fn, l);
1371  path += l;
1372  fn += l;
1373  *path = 0;
1374  cp = endp+1;
1375  } else {
1376  *path++ = '%';
1377  *path++ = *cp++;
1378  }
1379  } else {
1380  *path++ = '%';
1381  *path++ = *cp++;
1382  }
1383  dir = cp;
1384  }
1385  strcpy(path, dir);
1386  path += strlen(dir);
1387  if (*fn && path[-1] != '/')
1388  *path++ = '/';
1389  strcpy(path, fn);
1390 }
1391 
1392 /*
1393  * Make the directory containing path and any prefix directories.
1394  */
1395 void mkdir_prefix(char *path, int mode) {
1396  char *cp = strrchr(path, '/');
1397  if (!cp)
1398  return;
1399 
1400  *cp = 0;
1401  if (is_directory(path)) {
1402  *cp = '/';
1403  return;
1404  }
1405 
1406  if (mkdir(path, mode) == 0) {
1407  chmod(path, mode);
1408  *cp = '/';
1409  return;
1410  }
1411 
1412  mkdir_prefix(path, mode);
1413  mkdir(path, mode);
1414  chmod(path, mode);
1415  *cp = '/';
1416 }
1417 
1418 /*
1419  * Queries the M5 string from the header and attempts to populate the
1420  * reference from this using the REF_PATH environment.
1421  *
1422  * Returns 0 on sucess
1423  * -1 on failure
1424  */
1425 static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) {
1426  char *ref_path = getenv("REF_PATH");
1427  SAM_hdr_type *ty;
1428  SAM_hdr_tag *tag;
1429  char path[PATH_MAX], path_tmp[PATH_MAX];
1430  char *local_cache = getenv("REF_CACHE");
1431  mFILE *mf;
1432 
1433  if (fd->verbose)
1434  fprintf(stderr, "cram_populate_ref on fd %p, id %d\n", fd, id);
1435 
1436  if (!ref_path)
1437  ref_path = ".";
1438 
1439  if (!r->name)
1440  return -1;
1441 
1442  if (!(ty = sam_hdr_find(fd->header, "SQ", "SN", r->name)))
1443  return -1;
1444 
1445  if (!(tag = sam_hdr_find_key(fd->header, ty, "M5", NULL)))
1446  goto no_M5;
1447 
1448  if (fd->verbose)
1449  fprintf(stderr, "Querying ref %s\n", tag->str+3);
1450 
1451  /* Use cache if available */
1452  if (local_cache && *local_cache) {
1453  struct stat sb;
1454  FILE *fp;
1455 
1456  expand_cache_path(path, local_cache, tag->str+3);
1457 
1458  if (0 == stat(path, &sb) && (fp = fopen(path, "r"))) {
1459  r->length = sb.st_size;
1460  r->offset = r->line_length = r->bases_per_line = 0;
1461 
1462  r->fn = string_dup(fd->refs->pool, path);
1463 
1464  if (fd->refs->fp)
1465  fclose(fd->refs->fp);
1466  fd->refs->fp = fp;
1467  fd->refs->fn = r->fn;
1468 
1469  // Fall back to cram_get_ref() where it'll do the actual
1470  // reading of the file.
1471  return 0;
1472  }
1473  }
1474 
1475  /* Otherwise search */
1476  if ((mf = open_path_mfile(tag->str+3, ref_path, NULL))) {
1477  size_t sz;
1478  r->seq = mfsteal(mf, &sz);
1479  r->length = sz;
1480  } else {
1481  refs_t *refs;
1482  char *fn;
1483 
1484  no_M5:
1485  /* Failed to find in search path or M5 cache, see if @SQ UR: tag? */
1486  if (!(tag = sam_hdr_find_key(fd->header, ty, "UR", NULL)))
1487  return -1;
1488 
1489  fn = (strncmp(tag->str+3, "file:", 5) == 0)
1490  ? tag->str+8
1491  : tag->str+3;
1492 
1493  if (fd->refs->fp) {
1494  fclose(fd->refs->fp);
1495  fd->refs->fp = NULL;
1496  }
1497  if (!(refs = refs_load_fai(fd->refs, fn, 0)))
1498  return -1;
1499  fd->refs = refs;
1500  if (fd->refs->fp) {
1501  fclose(fd->refs->fp);
1502  fd->refs->fp = NULL;
1503  }
1504 
1505  if (!fd->refs->fn)
1506  return -1;
1507 
1508  if (-1 == refs2id(fd->refs, fd->header))
1509  return -1;
1510  if (!fd->refs->ref_id || !fd->refs->ref_id[id])
1511  return -1;
1512 
1513  // Local copy already, so fall back to cram_get_ref().
1514  return 0;
1515  }
1516 
1517  /* Populate the local disk cache if required */
1518  if (local_cache && *local_cache) {
1519  FILE *fp;
1520  int i;
1521 
1522  expand_cache_path(path, local_cache, tag->str+3);
1523  if (fd->verbose)
1524  fprintf(stderr, "Path='%s'\n", path);
1525  mkdir_prefix(path, 01777);
1526 
1527  i = 0;
1528  do {
1529  sprintf(path_tmp, "%s.tmp_%d", path, /*getpid(),*/ i);
1530  i++;
1531  fp = fopen(path_tmp, "wx");
1532  } while (fp == NULL && errno == EEXIST);
1533  if (!fp) {
1534  perror(path_tmp);
1535 
1536  // Not fatal - we have the data already so keep going.
1537  return 0;
1538  }
1539 
1540  if (r->length != fwrite(r->seq, 1, r->length, fp)) {
1541  perror(path);
1542  }
1543  if (-1 == paranoid_fclose(fp)) {
1544  unlink(path_tmp);
1545  } else {
1546  if (0 == chmod(path_tmp, 0444))
1547  rename(path_tmp, path);
1548  else
1549  unlink(path_tmp);
1550  }
1551  }
1552 
1553  return 0;
1554 }
1555 
1556 static void cram_ref_incr_locked(refs_t *r, int id) {
1557  RP("%d INC REF %d, %d %p\n", gettid(), id, (int)(id>=0?r->ref_id[id]->count+1:-999), id>=0?r->ref_id[id]->seq:(char *)1);
1558 
1559  if (id < 0 || !r->ref_id[id]->seq)
1560  return;
1561 
1562  if (r->last_id == id)
1563  r->last_id = -1;
1564 
1565  ++r->ref_id[id]->count;
1566 }
1567 
1568 void cram_ref_incr(refs_t *r, int id) {
1569  pthread_mutex_lock(&r->lock);
1570  cram_ref_incr_locked(r, id);
1571  pthread_mutex_unlock(&r->lock);
1572 }
1573 
1574 static void cram_ref_decr_locked(refs_t *r, int id) {
1575  RP("%d DEC REF %d, %d %p\n", gettid(), id, (int)(id>=0?r->ref_id[id]->count-1:-999), id>=0?r->ref_id[id]->seq:(char *)1);
1576 
1577  if (id < 0 || !r->ref_id[id]->seq) {
1578  assert(r->ref_id[id]->count >= 0);
1579  return;
1580  }
1581 
1582  if (--r->ref_id[id]->count <= 0) {
1583  assert(r->ref_id[id]->count == 0);
1584  if (r->last_id >= 0) {
1585  if (r->ref_id[r->last_id]->count <= 0 &&
1586  r->ref_id[r->last_id]->seq) {
1587  RP("%d FREE REF %d (%p)\n", gettid(),
1588  r->last_id, r->ref_id[r->last_id]->seq);
1589  free(r->ref_id[r->last_id]->seq);
1590  r->ref_id[r->last_id]->seq = NULL;
1591  r->ref_id[r->last_id]->length = 0;
1592  }
1593  r->last_id = -1;
1594  } else {
1595  r->last_id = id;
1596  }
1597  }
1598 }
1599 
1600 void cram_ref_decr(refs_t *r, int id) {
1601  pthread_mutex_lock(&r->lock);
1602  cram_ref_decr_locked(r, id);
1603  pthread_mutex_unlock(&r->lock);
1604 }
1605 
1606 /*
1607  * Used by cram_ref_load and cram_ref_get. The file handle will have
1608  * already been opened, so we can catch it. The ref_entry *e informs us
1609  * of whether this is a multi-line fasta file or a raw MD5 style file.
1610  * Either way we create a single contiguous sequence.
1611  *
1612  * Returns all or part of a reference sequence on success (malloced);
1613  * NULL on failure.
1614  */
1615 static char *load_ref_portion(FILE *fp, ref_entry *e, int start, int end) {
1616  off_t offset, len;
1617  char *seq;
1618 
1619  if (end < start)
1620  end = start;
1621 
1622  /*
1623  * Compute locations in file. This is trivial for the MD5 files, but
1624  * is still necessary for the fasta variants.
1625  */
1626  offset = e->line_length
1627  ? e->offset + (start-1)/e->bases_per_line * e->line_length +
1628  (start-1) % e->bases_per_line
1629  : start-1;
1630 
1631  len = (e->line_length
1632  ? e->offset + (end-1)/e->bases_per_line * e->line_length +
1633  (end-1) % e->bases_per_line
1634  : end-1) - offset + 1;
1635 
1636  if (0 != fseeko(fp, offset, SEEK_SET)) {
1637  perror("fseeko() on reference file");
1638  return NULL;
1639  }
1640 
1641  if (len == 0 || !(seq = malloc(len))) {
1642  return NULL;
1643  }
1644 
1645  if (len != fread(seq, 1, len, fp)) {
1646  perror("fread() on reference file");
1647  free(seq);
1648  return NULL;
1649  }
1650 
1651  /* Strip white-space if required. */
1652  if (len != end-start+1) {
1653  int i, j;
1654  char *cp = seq;
1655  char *cp_to;
1656 
1657  for (i = j = 0; i < len; i++) {
1658  if (cp[i] >= '!' && cp[i] <= '~')
1659  cp[j++] = cp[i] & ~0x20;
1660  }
1661  cp_to = cp+j;
1662 
1663  if (cp_to - seq != end-start+1) {
1664  fprintf(stderr, "Malformed reference file?\n");
1665  free(seq);
1666  return NULL;
1667  }
1668  } else {
1669  int i;
1670  for (i = 0; i < len; i++) {
1671  seq[i] = seq[i] & ~0x20; // uppercase in ASCII
1672  }
1673  }
1674 
1675  return seq;
1676 }
1677 
1678 /*
1679  * Load the entire reference 'id'.
1680  * This also increments the reference count by 1.
1681  *
1682  * Returns ref_entry on success;
1683  * NULL on failure
1684  */
1686  ref_entry *e = r->ref_id[id];
1687  int start = 1, end = e->length;
1688  char *seq;
1689 
1690  if (e->seq) {
1691  return e;
1692  }
1693 
1694  assert(e->count == 0);
1695 
1696  if (r->last) {
1697 #ifdef REF_DEBUG
1698  int idx = 0;
1699  for (idx = 0; idx < r->nref; idx++)
1700  if (r->last == r->ref_id[idx])
1701  break;
1702  RP("%d cram_ref_load DECR %d\n", gettid(), idx);
1703 #endif
1704  assert(r->last->count > 0);
1705  if (--r->last->count <= 0) {
1706  RP("%d FREE REF %d (%p)\n", gettid(), id, r->ref_id[id]->seq);
1707  if (r->last->seq) {
1708  free(r->last->seq);
1709  r->last->seq = NULL;
1710  }
1711  }
1712  }
1713 
1714  /* Open file if it's not already the current open reference */
1715  if (strcmp(r->fn, e->fn) || r->fp == NULL) {
1716  if (r->fp)
1717  fclose(r->fp);
1718  r->fn = e->fn;
1719  if (!(r->fp = fopen(r->fn, "r"))) {
1720  perror(r->fn);
1721  return NULL;
1722  }
1723  }
1724 
1725  RP("%d Loading ref %d (%d..%d)\n", gettid(), id, start, end);
1726 
1727  if (!(seq = load_ref_portion(r->fp, e, start, end))) {
1728  return NULL;
1729  }
1730 
1731  RP("%d Loaded ref %d (%d..%d) = %p\n", gettid(), id, start, end, seq);
1732 
1733  RP("%d INC REF %d, %d\n", gettid(), id, (int)(e->count+1));
1734  e->seq = seq;
1735  e->count++;
1736 
1737  /*
1738  * Also keep track of last used ref so incr/decr loops on the same
1739  * sequence don't cause load/free loops.
1740  */
1741  RP("%d cram_ref_load INCR %d => %d\n", gettid(), id, e->count+1);
1742  r->last = e;
1743  e->count++;
1744 
1745  return e;
1746 }
1747 
1748 /*
1749  * Returns a portion of a reference sequence from start to end inclusive.
1750  * The returned pointer is owned by either the cram_file fd or by the
1751  * internal refs_t structure and should not be freed by the caller.
1752  *
1753  * The difference is whether or not this refs_t is in use by just the one
1754  * cram_fd or by multiples, or whether we have multiple threads accessing
1755  * references. In either case fd->shared will be true and we start using
1756  * reference counting to track the number of users of a specific reference
1757  * sequence.
1758  *
1759  * Otherwise the ref seq returned is allocated as part of cram_fd itself
1760  * and will be freed up on the next call to cram_get_ref or cram_close.
1761  *
1762  * To return the entire reference sequence, specify start as 1 and end
1763  * as 0.
1764  *
1765  * To cease using a reference, call cram_ref_decr().
1766  *
1767  * Returns reference on success,
1768  * NULL on failure
1769  */
1770 char *cram_get_ref(cram_fd *fd, int id, int start, int end) {
1771  ref_entry *r;
1772  char *seq;
1773  int ostart = start;
1774 
1775  if (id == -1)
1776  return NULL;
1777 
1778  /* FIXME: axiomatic query of r->seq being true?
1779  * Or shortcut for unsorted data where we load once and never free?
1780  */
1781 
1782  //fd->shared_ref = 1; // hard code for now to simplify things
1783 
1784  pthread_mutex_lock(&fd->ref_lock);
1785 
1786  RP("%d cram_get_ref on fd %p, id %d, range %d..%d\n", gettid(), fd, id, start, end);
1787 
1788  /*
1789  * Unsorted data implies we want to fetch an entire reference at a time.
1790  * We just deal with this at the moment by claiming we're sharing
1791  * references instead, which has the same requirement.
1792  */
1793  if (fd->unsorted)
1794  fd->shared_ref = 1;
1795 
1796 
1797  /* Sanity checking: does this ID exist? */
1798  if (id >= fd->refs->nref) {
1799  fprintf(stderr, "No reference found for id %d\n", id);
1800  pthread_mutex_unlock(&fd->ref_lock);
1801  return NULL;
1802  }
1803 
1804  if (!fd->refs || !fd->refs->ref_id[id]) {
1805  fprintf(stderr, "No reference found for id %d\n", id);
1806  pthread_mutex_unlock(&fd->ref_lock);
1807  return NULL;
1808  }
1809 
1810  if (!(r = fd->refs->ref_id[id])) {
1811  fprintf(stderr, "No reference found for id %d\n", id);
1812  pthread_mutex_unlock(&fd->ref_lock);
1813  return NULL;
1814  }
1815 
1816 
1817  /*
1818  * It has an entry, but may not have been populated yet.
1819  * Any manually loaded .fai files have their lengths known.
1820  * A ref entry computed from @SQ lines (M5 or UR field) will have
1821  * r->length == 0 unless it's been loaded once and verified that we have
1822  * an on-disk filename for it.
1823  *
1824  * 19 Sep 2013: Moved the lock here as the cram_populate_ref code calls
1825  * open_path_mfile and libcurl, which isn't multi-thread safe unless I
1826  * rewrite my code to have one curl handle per thread.
1827  */
1828  pthread_mutex_lock(&fd->refs->lock);
1829  if (r->length == 0) {
1830  if (cram_populate_ref(fd, id, r) == -1) {
1831  fprintf(stderr, "Failed to populate reference for id %d\n", id);
1832  pthread_mutex_unlock(&fd->refs->lock);
1833  pthread_mutex_unlock(&fd->ref_lock);
1834  return NULL;
1835  }
1836  r = fd->refs->ref_id[id];
1837  }
1838 
1839 
1840  /*
1841  * We now know that we the filename containing the reference, so check
1842  * for limits. If it's over half the reference we'll load all of it in
1843  * memory as this will speed up subsequent calls.
1844  */
1845  if (end < 1)
1846  end = r->length;
1847  if (end >= r->length)
1848  end = r->length;
1849  assert(start >= 1);
1850 
1851  if (end - start >= 0.5*r->length || fd->shared_ref) {
1852  start = 1;
1853  end = r->length;
1854  }
1855 
1856  /*
1857  * Maybe we have it cached already? If so use it.
1858  *
1859  * Alternatively if we don't have the sequence but we're sharing
1860  * references and/or are asking for the entire length of it, then
1861  * load the full reference into the refs structure and return
1862  * a pointer to that one instead.
1863  */
1864  if (fd->shared_ref || r->seq || (start == 1 && end == r->length)) {
1865  char *cp;
1866 
1867  if (id >= 0) {
1868  if (r->seq) {
1869  cram_ref_incr_locked(fd->refs, id);
1870  } else {
1871  ref_entry *e;
1872  if (!(e = cram_ref_load(fd->refs, id))) {
1873  pthread_mutex_unlock(&fd->refs->lock);
1874  pthread_mutex_unlock(&fd->ref_lock);
1875  return NULL;
1876  }
1877 
1878  /* unsorted data implies cache ref indefinitely, to avoid
1879  * continually loading and unloading.
1880  */
1881  if (fd->unsorted)
1882  cram_ref_incr_locked(fd->refs, id);
1883  }
1884 
1885  fd->ref = NULL; /* We never access it directly */
1886  fd->ref_start = 1;
1887  fd->ref_end = r->length;
1888  fd->ref_id = id;
1889 
1890  cp = fd->refs->ref_id[id]->seq + ostart-1;
1891  } else {
1892  fd->ref = NULL;
1893  cp = NULL;
1894  }
1895 
1896  RP("%d cram_get_ref returning for id %d, count %d\n", gettid(), id, (int)r->count);
1897 
1898  pthread_mutex_unlock(&fd->refs->lock);
1899  pthread_mutex_unlock(&fd->ref_lock);
1900  return cp;
1901  }
1902 
1903  /*
1904  * Otherwise we're not sharing, we don't have a copy of it already and
1905  * we're only asking for a small portion of it.
1906  *
1907  * In this case load up just that segment ourselves, freeing any old
1908  * small segments in the process.
1909  */
1910 
1911  /* Unmapped ref ID */
1912  if (id < 0) {
1913  if (fd->ref_free) {
1914  free(fd->ref_free);
1915  fd->ref_free = NULL;
1916  }
1917  fd->ref = NULL;
1918  fd->ref_id = id;
1919  pthread_mutex_unlock(&fd->refs->lock);
1920  pthread_mutex_unlock(&fd->ref_lock);
1921  return NULL;
1922  }
1923 
1924  /* Open file if it's not already the current open reference */
1925  if (strcmp(fd->refs->fn, r->fn) || fd->refs->fp == NULL) {
1926  if (fd->refs->fp)
1927  fclose(fd->refs->fp);
1928  fd->refs->fn = r->fn;
1929  if (!(fd->refs->fp = fopen(fd->refs->fn, "r"))) {
1930  perror(fd->refs->fn);
1931  pthread_mutex_unlock(&fd->refs->lock);
1932  pthread_mutex_unlock(&fd->ref_lock);
1933  return NULL;
1934  }
1935  }
1936 
1937  if (!(fd->ref = load_ref_portion(fd->refs->fp, r, start, end))) {
1938  pthread_mutex_unlock(&fd->refs->lock);
1939  pthread_mutex_unlock(&fd->ref_lock);
1940  return NULL;
1941  }
1942 
1943  if (fd->ref_free)
1944  free(fd->ref_free);
1945 
1946  fd->ref_id = id;
1947  fd->ref_start = start;
1948  fd->ref_end = end;
1949  fd->ref_free = fd->ref;
1950  seq = fd->ref;
1951 
1952  pthread_mutex_unlock(&fd->refs->lock);
1953  pthread_mutex_unlock(&fd->ref_lock);
1954 
1955  return seq + ostart - start;
1956 }
1957 
1958 /*
1959  * If fd has been opened for reading, it may be permitted to specify 'fn'
1960  * as NULL and let the code auto-detect the reference by parsing the
1961  * SAM header @SQ lines.
1962  */
1963 int cram_load_reference(cram_fd *fd, char *fn) {
1964  if (fn) {
1965  fd->refs = refs_load_fai(fd->refs, fn,
1966  !(fd->embed_ref && fd->mode == 'r'));
1967  fn = fd->refs ? fd->refs->fn : NULL;
1968  }
1969  fd->ref_fn = fn;
1970 
1971  if ((!fd->refs || (fd->refs->nref == 0 && !fn)) && fd->header) {
1972  if (!(fd->refs = refs_create()))
1973  return -1;
1974  if (-1 == refs_from_header(fd->refs, fd, fd->header))
1975  return -1;
1976  }
1977 
1978  if (-1 == refs2id(fd->refs, fd->header))
1979  return -1;
1980 
1981  return fn ? 0 : -1;
1982 }
1983 
1984 /* ----------------------------------------------------------------------
1985  * Containers
1986  */
1987 
1988 /*
1989  * Creates a new container, specifying the maximum number of slices
1990  * and records permitted.
1991  *
1992  * Returns cram_container ptr on success
1993  * NULL on failure
1994  */
1995 cram_container *cram_new_container(int nrec, int nslice) {
1996  cram_container *c = calloc(1, sizeof(*c));
1997  if (!c)
1998  return NULL;
1999 
2000  c->curr_ref = -2;
2001 
2002  c->max_c_rec = nrec * nslice;
2003  c->curr_c_rec = 0;
2004 
2005  c->max_rec = nrec;
2006  c->record_counter = 0;
2007  c->num_bases = 0;
2008 
2009  c->max_slice = nslice;
2010  c->curr_slice = 0;
2011 
2012  c->pos_sorted = 1;
2013  c->max_apos = 0;
2014  c->multi_seq = 0;
2015 
2016  c->bams = NULL;
2017 
2018  if (!(c->slices = (cram_slice **)calloc(nslice, sizeof(cram_slice *))))
2019  goto err;
2020  c->slice = NULL;
2021 
2022  if (!(c->comp_hdr = cram_new_compression_header()))
2023  goto err;
2024  c->comp_hdr_block = NULL;
2025 
2026  if (!(c->BF_stats = cram_stats_create())) goto err;
2027  if (!(c->CF_stats = cram_stats_create())) goto err;
2028  if (!(c->RN_stats = cram_stats_create())) goto err;
2029  if (!(c->AP_stats = cram_stats_create())) goto err;
2030  if (!(c->RG_stats = cram_stats_create())) goto err;
2031  if (!(c->MQ_stats = cram_stats_create())) goto err;
2032  if (!(c->NS_stats = cram_stats_create())) goto err;
2033  if (!(c->NP_stats = cram_stats_create())) goto err;
2034  if (!(c->TS_stats = cram_stats_create())) goto err;
2035  if (!(c->MF_stats = cram_stats_create())) goto err;
2036  if (!(c->NF_stats = cram_stats_create())) goto err;
2037  if (!(c->RL_stats = cram_stats_create())) goto err;
2038  if (!(c->FN_stats = cram_stats_create())) goto err;
2039  if (!(c->FC_stats = cram_stats_create())) goto err;
2040  if (!(c->FP_stats = cram_stats_create())) goto err;
2041  if (!(c->DL_stats = cram_stats_create())) goto err;
2042  if (!(c->BA_stats = cram_stats_create())) goto err;
2043  if (!(c->QS_stats = cram_stats_create())) goto err;
2044  if (!(c->BS_stats = cram_stats_create())) goto err;
2045  if (!(c->TC_stats = cram_stats_create())) goto err;
2046  if (!(c->TN_stats = cram_stats_create())) goto err;
2047  if (!(c->TL_stats = cram_stats_create())) goto err;
2048  if (!(c->RI_stats = cram_stats_create())) goto err;
2049  if (!(c->RS_stats = cram_stats_create())) goto err;
2050  if (!(c->PD_stats = cram_stats_create())) goto err;
2051  if (!(c->HC_stats = cram_stats_create())) goto err;
2052 
2053  //c->aux_B_stats = cram_stats_create();
2054 
2055  if (!(c->tags_used = kh_init(s_i2i)))
2056  goto err;
2057  c->refs_used = 0;
2058 
2059  return c;
2060 
2061  err:
2062  if (c) {
2063  if (c->slices)
2064  free(c->slices);
2065  free(c);
2066  }
2067  return NULL;
2068 }
2069 
2071  int i;
2072 
2073  if (!c)
2074  return;
2075 
2076  if (c->refs_used)
2077  free(c->refs_used);
2078 
2079  if (c->landmark)
2080  free(c->landmark);
2081 
2082  if (c->comp_hdr)
2084 
2085  if (c->comp_hdr_block)
2087 
2088  if (c->slices) {
2089  for (i = 0; i < c->max_slice; i++)
2090  if (c->slices[i])
2091  cram_free_slice(c->slices[i]);
2092  free(c->slices);
2093  }
2094 
2095  if (c->TS_stats) cram_stats_free(c->TS_stats);
2096  if (c->RG_stats) cram_stats_free(c->RG_stats);
2097  if (c->FP_stats) cram_stats_free(c->FP_stats);
2098  if (c->NS_stats) cram_stats_free(c->NS_stats);
2099  if (c->RN_stats) cram_stats_free(c->RN_stats);
2100  if (c->CF_stats) cram_stats_free(c->CF_stats);
2101  if (c->TN_stats) cram_stats_free(c->TN_stats);
2102  if (c->BA_stats) cram_stats_free(c->BA_stats);
2103  if (c->TV_stats) cram_stats_free(c->TV_stats);
2104  if (c->BS_stats) cram_stats_free(c->BS_stats);
2105  if (c->FC_stats) cram_stats_free(c->FC_stats);
2106  if (c->BF_stats) cram_stats_free(c->BF_stats);
2107  if (c->AP_stats) cram_stats_free(c->AP_stats);
2108  if (c->NF_stats) cram_stats_free(c->NF_stats);
2109  if (c->MF_stats) cram_stats_free(c->MF_stats);
2110  if (c->FN_stats) cram_stats_free(c->FN_stats);
2111  if (c->RL_stats) cram_stats_free(c->RL_stats);
2112  if (c->DL_stats) cram_stats_free(c->DL_stats);
2113  if (c->TC_stats) cram_stats_free(c->TC_stats);
2114  if (c->TL_stats) cram_stats_free(c->TL_stats);
2115  if (c->MQ_stats) cram_stats_free(c->MQ_stats);
2116  if (c->TM_stats) cram_stats_free(c->TM_stats);
2117  if (c->QS_stats) cram_stats_free(c->QS_stats);
2118  if (c->NP_stats) cram_stats_free(c->NP_stats);
2119  if (c->RI_stats) cram_stats_free(c->RI_stats);
2120  if (c->RS_stats) cram_stats_free(c->RS_stats);
2121  if (c->PD_stats) cram_stats_free(c->PD_stats);
2122  if (c->HC_stats) cram_stats_free(c->HC_stats);
2123 
2124  //if (c->aux_B_stats) cram_stats_free(c->aux_B_stats);
2125 
2126  if (c->tags_used) kh_destroy(s_i2i, c->tags_used);
2127 
2128  free(c);
2129 }
2130 
2131 /*
2132  * Reads a container header.
2133  *
2134  * Returns cram_container on success
2135  * NULL on failure or no container left (fd->err == 0).
2136  */
2138  cram_container c2, *c;
2139  int i, s;
2140  size_t rd = 0;
2141 
2142  fd->err = 0;
2143 
2144  memset(&c2, 0, sizeof(c2));
2145  if (fd->version == CRAM_1_VERS) {
2146  if ((s = itf8_decode(fd, &c2.length)) == -1) {
2147  fd->eof = fd->empty_container ? 1 : 2;
2148  return NULL;
2149  } else {
2150  rd+=s;
2151  }
2152  } else {
2153  if ((s = int32_decode(fd, &c2.length)) == -1) {
2154  fd->eof = fd->empty_container ? 1 : 2;
2155  return NULL;
2156  } else {
2157  rd+=s;
2158  }
2159  }
2160  if ((s = itf8_decode(fd, &c2.ref_seq_id)) == -1) return NULL; else rd+=s;
2161  if ((s = itf8_decode(fd, &c2.ref_seq_start))== -1) return NULL; else rd+=s;
2162  if ((s = itf8_decode(fd, &c2.ref_seq_span)) == -1) return NULL; else rd+=s;
2163  if ((s = itf8_decode(fd, &c2.num_records)) == -1) return NULL; else rd+=s;
2164 
2165  if (fd->version == CRAM_1_VERS) {
2166  c2.record_counter = 0;
2167  c2.num_bases = 0;
2168  } else {
2169  if ((s = itf8_decode(fd, &c2.record_counter)) == -1)
2170  return NULL;
2171  else
2172  rd += s;
2173 
2174  if ((s = ltf8_decode(fd, &c2.num_bases))== -1)
2175  return NULL;
2176  else
2177  rd += s;
2178  }
2179  if ((s = itf8_decode(fd, &c2.num_blocks)) == -1) return NULL; else rd+=s;
2180  if ((s = itf8_decode(fd, &c2.num_landmarks))== -1) return NULL; else rd+=s;
2181 
2182  if (!(c = calloc(1, sizeof(*c))))
2183  return NULL;
2184 
2185  *c = c2;
2186 
2187  if (!(c->landmark = malloc(c->num_landmarks * sizeof(int32_t))) &&
2188  c->num_landmarks) {
2189  fd->err = errno;
2191  return NULL;
2192  }
2193  for (i = 0; i < c->num_landmarks; i++) {
2194  if ((s = itf8_decode(fd, &c->landmark[i])) == -1) {
2196  return NULL;
2197  } else {
2198  rd += s;
2199  }
2200  }
2201  c->offset = rd;
2202 
2203  c->slices = NULL;
2204  c->curr_slice = 0;
2205  c->max_slice = c->num_landmarks;
2206  c->slice_rec = 0;
2207  c->curr_rec = 0;
2208  c->max_rec = 0;
2209 
2210  if (c->ref_seq_id == -2) {
2211  c->multi_seq = 1;
2212  fd->multi_seq = 1;
2213  }
2214 
2215  fd->empty_container =
2216  (c->num_records == 0 &&
2217  c->ref_seq_id == -1 &&
2218  c->ref_seq_start == 0x454f46 /* EOF */) ? 1 : 0;
2219 
2220  return c;
2221 }
2222 
2223 /*
2224  * Writes a container structure.
2225  *
2226  * Returns 0 on success
2227  * -1 on failure
2228  */
2230  char buf_a[1024], *buf = buf_a, *cp;
2231  int i;
2232 
2233  if (50 + c->num_landmarks * 5 >= 1024)
2234  buf = malloc(50 + c->num_landmarks * 5);
2235  cp = buf;
2236 
2237  if (fd->version == CRAM_1_VERS) {
2238  cp += itf8_put(cp, c->length);
2239  } else {
2240  *(int32_t *)cp = le_int4(c->length);
2241  cp += 4;
2242  }
2243  if (c->multi_seq) {
2244  cp += itf8_put(cp, -2);
2245  cp += itf8_put(cp, 0);
2246  cp += itf8_put(cp, 0);
2247  } else {
2248  cp += itf8_put(cp, c->ref_seq_id);
2249  cp += itf8_put(cp, c->ref_seq_start);
2250  cp += itf8_put(cp, c->ref_seq_span);
2251  }
2252  cp += itf8_put(cp, c->num_records);
2253  if (fd->version != CRAM_1_VERS) {
2254  cp += itf8_put(cp, c->record_counter);
2255  cp += ltf8_put(cp, c->num_bases);
2256  }
2257  cp += itf8_put(cp, c->num_blocks);
2258  cp += itf8_put(cp, c->num_landmarks);
2259  for (i = 0; i < c->num_landmarks; i++)
2260  cp += itf8_put(cp, c->landmark[i]);
2261  if (cp-buf != hwrite(fd->fp, buf, cp-buf)) {
2262  if (buf != buf_a)
2263  free(buf);
2264  return -1;
2265  }
2266 
2267  if (buf != buf_a)
2268  free(buf);
2269 
2270  return 0;
2271 }
2272 
2273 // common component shared by cram_flush_container{,_mt}
2274 static int cram_flush_container2(cram_fd *fd, cram_container *c) {
2275  int i, j;
2276 
2277  //fprintf(stderr, "Writing container %d, sum %u\n", c->record_counter, sum);
2278 
2279  /* Write the container struct itself */
2280  if (0 != cram_write_container(fd, c))
2281  return -1;
2282 
2283  /* And the compression header */
2284  if (0 != cram_write_block(fd, c->comp_hdr_block))
2285  return -1;
2286 
2287  /* Followed by the slice blocks */
2288  for (i = 0; i < c->curr_slice; i++) {
2289  cram_slice *s = c->slices[i];
2290 
2291  if (0 != cram_write_block(fd, s->hdr_block))
2292  return -1;
2293 
2294  for (j = 0; j < s->hdr->num_blocks; j++) {
2295  if (0 != cram_write_block(fd, s->block[j]))
2296  return -1;
2297  }
2298  }
2299 
2300  return hflush(fd->fp) == 0 ? 0 : -1;
2301 }
2302 
2303 /*
2304  * Flushes a completely or partially full container to disk, writing
2305  * container structure, header and blocks. This also calls the encoder
2306  * functions.
2307  *
2308  * Returns 0 on success
2309  * -1 on failure
2310  */
2312  /* Encode the container blocks and generate compression header */
2313  if (0 != cram_encode_container(fd, c))
2314  return -1;
2315 
2316  return cram_flush_container2(fd, c);
2317 }
2318 
2319 typedef struct {
2322 } cram_job;
2323 
2324 void *cram_flush_thread(void *arg) {
2325  cram_job *j = (cram_job *)arg;
2326 
2327  /* Encode the container blocks and generate compression header */
2328  if (0 != cram_encode_container(j->fd, j->c)) {
2329  fprintf(stderr, "cram_encode_container failed\n");
2330  return NULL;
2331  }
2332 
2333  return arg;
2334 }
2335 
2336 static int cram_flush_result(cram_fd *fd) {
2337  int i, ret = 0;
2338  t_pool_result *r;
2339 
2340  while ((r = t_pool_next_result(fd->rqueue))) {
2341  cram_job *j = (cram_job *)r->data;
2342  cram_container *c;
2343 
2344  if (!j) {
2345  t_pool_delete_result(r, 0);
2346  return -1;
2347  }
2348 
2349  fd = j->fd;
2350  c = j->c;
2351 
2352  if (0 != cram_flush_container2(fd, c))
2353  return -1;
2354 
2355  /* Free the container */
2356  for (i = 0; i < c->max_slice; i++) {
2357  cram_free_slice(c->slices[i]);
2358  c->slices[i] = NULL;
2359  }
2360 
2361  c->slice = NULL;
2362  c->curr_slice = 0;
2363 
2365 
2366  ret |= hflush(fd->fp) == 0 ? 0 : -1;
2367 
2368  t_pool_delete_result(r, 1);
2369  }
2370 
2371  return ret;
2372 }
2373 
2375  cram_job *j;
2376 
2377  if (!fd->pool)
2378  return cram_flush_container(fd, c);
2379 
2380  if (!(j = malloc(sizeof(*j))))
2381  return -1;
2382  j->fd = fd;
2383  j->c = c;
2384 
2386 
2387  return cram_flush_result(fd);
2388 }
2389 
2390 /* ----------------------------------------------------------------------
2391  * Compression headers; the first part of the container
2392  */
2393 
2394 /*
2395  * Creates a new blank container compression header
2396  *
2397  * Returns header ptr on success
2398  * NULL on failure
2399  */
2401  cram_block_compression_hdr *hdr = calloc(1, sizeof(*hdr));
2402  if (!hdr)
2403  return NULL;
2404 
2405  if (!(hdr->TD_blk = cram_new_block(CORE, 0))) {
2406  free(hdr);
2407  return NULL;
2408  }
2409 
2410  if (!(hdr->TD_hash = kh_init(m_s2i))) {
2411  cram_free_block(hdr->TD_blk);
2412  free(hdr);
2413  return NULL;
2414  }
2415 
2416  if (!(hdr->TD_keys = string_pool_create(8192))) {
2417  kh_destroy(m_s2i, hdr->TD_hash);
2418  cram_free_block(hdr->TD_blk);
2419  free(hdr);
2420  return NULL;
2421  }
2422 
2423  return hdr;
2424 }
2425 
2427  int i;
2428 
2429  if (hdr->landmark)
2430  free(hdr->landmark);
2431 
2432  if (hdr->preservation_map)
2433  kh_destroy(map, hdr->preservation_map);
2434 
2435  for (i = 0; i < CRAM_MAP_HASH; i++) {
2436  cram_map *m, *m2;
2437  for (m = hdr->rec_encoding_map[i]; m; m = m2) {
2438  m2 = m->next;
2439  if (m->codec)
2440  m->codec->free(m->codec);
2441  free(m);
2442  }
2443  }
2444 
2445  for (i = 0; i < CRAM_MAP_HASH; i++) {
2446  cram_map *m, *m2;
2447  for (m = hdr->tag_encoding_map[i]; m; m = m2) {
2448  m2 = m->next;
2449  if (m->codec)
2450  m->codec->free(m->codec);
2451  free(m);
2452  }
2453  }
2454 
2455  if (hdr->BF_codec) hdr->BF_codec->free(hdr->BF_codec);
2456  if (hdr->CF_codec) hdr->CF_codec->free(hdr->CF_codec);
2457  if (hdr->RL_codec) hdr->RL_codec->free(hdr->RL_codec);
2458  if (hdr->AP_codec) hdr->AP_codec->free(hdr->AP_codec);
2459  if (hdr->RG_codec) hdr->RG_codec->free(hdr->RG_codec);
2460  if (hdr->MF_codec) hdr->MF_codec->free(hdr->MF_codec);
2461  if (hdr->NS_codec) hdr->NS_codec->free(hdr->NS_codec);
2462  if (hdr->NP_codec) hdr->NP_codec->free(hdr->NP_codec);
2463  if (hdr->TS_codec) hdr->TS_codec->free(hdr->TS_codec);
2464  if (hdr->NF_codec) hdr->NF_codec->free(hdr->NF_codec);
2465  if (hdr->TC_codec) hdr->TC_codec->free(hdr->TC_codec);
2466  if (hdr->TN_codec) hdr->TN_codec->free(hdr->TN_codec);
2467  if (hdr->TL_codec) hdr->TL_codec->free(hdr->TL_codec);
2468  if (hdr->FN_codec) hdr->FN_codec->free(hdr->FN_codec);
2469  if (hdr->FC_codec) hdr->FC_codec->free(hdr->FC_codec);
2470  if (hdr->FP_codec) hdr->FP_codec->free(hdr->FP_codec);
2471  if (hdr->BS_codec) hdr->BS_codec->free(hdr->BS_codec);
2472  if (hdr->IN_codec) hdr->IN_codec->free(hdr->IN_codec);
2473  if (hdr->SC_codec) hdr->SC_codec->free(hdr->SC_codec);
2474  if (hdr->DL_codec) hdr->DL_codec->free(hdr->DL_codec);
2475  if (hdr->BA_codec) hdr->BA_codec->free(hdr->BA_codec);
2476  if (hdr->MQ_codec) hdr->MQ_codec->free(hdr->MQ_codec);
2477  if (hdr->RN_codec) hdr->RN_codec->free(hdr->RN_codec);
2478  if (hdr->QS_codec) hdr->QS_codec->free(hdr->QS_codec);
2479  if (hdr->Qs_codec) hdr->Qs_codec->free(hdr->Qs_codec);
2480  if (hdr->RI_codec) hdr->RI_codec->free(hdr->RI_codec);
2481  if (hdr->RS_codec) hdr->RS_codec->free(hdr->RS_codec);
2482  if (hdr->PD_codec) hdr->PD_codec->free(hdr->PD_codec);
2483  if (hdr->HC_codec) hdr->HC_codec->free(hdr->HC_codec);
2484 
2485  if (hdr->TL)
2486  free(hdr->TL);
2487  if (hdr->TD_blk)
2488  cram_free_block(hdr->TD_blk);
2489  if (hdr->TD_hash)
2490  kh_destroy(m_s2i, hdr->TD_hash);
2491  if (hdr->TD_keys)
2493 
2494  free(hdr);
2495 }
2496 
2497 
2498 /* ----------------------------------------------------------------------
2499  * Slices and slice headers
2500  */
2501 
2503  if (!hdr)
2504  return;
2505 
2506  if (hdr->block_content_ids)
2507  free(hdr->block_content_ids);
2508 
2509  free(hdr);
2510 
2511  return;
2512 }
2513 
2515  if (!s)
2516  return;
2517 
2518  if (s->hdr_block)
2520 
2521  if (s->block) {
2522  int i;
2523 
2524  if (s->hdr) {
2525  for (i = 0; i < s->hdr->num_blocks; i++) {
2526  cram_free_block(s->block[i]);
2527  }
2528  }
2529  free(s->block);
2530  }
2531 
2532  if (s->block_by_id)
2533  free(s->block_by_id);
2534 
2535  if (s->hdr)
2537 
2538  if (s->seqs_blk)
2540 
2541  if (s->qual_blk)
2543 
2544  if (s->name_blk)
2546 
2547  if (s->aux_blk)
2549 
2550  if (s->base_blk)
2552 
2553  if (s->soft_blk)
2555 
2556 #ifdef TN_external
2557  if (s->tn_blk)
2558  cram_free_block(s->tn_blk);
2559 #endif
2560 
2561  if (s->cigar)
2562  free(s->cigar);
2563 
2564  if (s->crecs)
2565  free(s->crecs);
2566 
2567  if (s->features)
2568  free(s->features);
2569 
2570 #ifndef TN_external
2571  if (s->TN)
2572  free(s->TN);
2573 #endif
2574 
2575  if (s->pair_keys)
2577 
2578  if (s->pair)
2579  kh_destroy(m_s2i, s->pair);
2580 
2581  free(s);
2582 }
2583 
2584 /*
2585  * Creates a new empty slice in memory, for subsequent writing to
2586  * disk.
2587  *
2588  * Returns cram_slice ptr on success
2589  * NULL on failure
2590  */
2592  cram_slice *s = calloc(1, sizeof(*s));
2593  if (!s)
2594  return NULL;
2595 
2596  if (!(s->hdr = (cram_block_slice_hdr *)calloc(1, sizeof(*s->hdr))))
2597  goto err;
2598  s->hdr->content_type = type;
2599 
2600  s->hdr_block = NULL;
2601  s->block = NULL;
2602  s->block_by_id = NULL;
2603  s->last_apos = 0;
2604  s->id = 0;
2605  if (!(s->crecs = malloc(nrecs * sizeof(cram_record)))) goto err;
2606  s->cigar = NULL;
2607  s->cigar_alloc = 0;
2608  s->ncigar = 0;
2609 
2610  if (!(s->seqs_blk = cram_new_block(EXTERNAL, 0))) goto err;
2611  if (!(s->qual_blk = cram_new_block(EXTERNAL, CRAM_EXT_QUAL))) goto err;
2612  if (!(s->name_blk = cram_new_block(EXTERNAL, CRAM_EXT_NAME))) goto err;
2613  if (!(s->aux_blk = cram_new_block(EXTERNAL, CRAM_EXT_TAG))) goto err;
2614  if (!(s->base_blk = cram_new_block(EXTERNAL, CRAM_EXT_IN))) goto err;
2615  if (!(s->soft_blk = cram_new_block(EXTERNAL, CRAM_EXT_SC))) goto err;
2616 #ifdef TN_external
2617  if (!(s->tn_blk = cram_new_block(EXTERNAL, CRAM_EXT_TN))) goto err;
2618 #endif
2619 
2620  s->features = NULL;
2621  s->nfeatures = s->afeatures = 0;
2622 
2623 #ifndef TN_external
2624  s->TN = NULL;
2625  s->nTN = s->aTN = 0;
2626 #endif
2627 
2628  // Volatile keys as we do realloc in dstring
2629  if (!(s->pair_keys = string_pool_create(8192))) goto err;
2630  if (!(s->pair = kh_init(m_s2i))) goto err;
2631 
2632 #ifdef BA_external
2633  s->BA_len = 0;
2634 #endif
2635 
2636  return s;
2637 
2638  err:
2639  if (s)
2640  cram_free_slice(s);
2641 
2642  return NULL;
2643 }
2644 
2645 /*
2646  * Loads an entire slice.
2647  * FIXME: In 1.0 the native unit of slices within CRAM is broken
2648  * as slices contain references to objects in other slices.
2649  * To work around this while keeping the slice oriented outer loop
2650  * we read all slices and stitch them together into a fake large
2651  * slice instead.
2652  *
2653  * Returns cram_slice ptr on success
2654  * NULL on failure
2655  */
2657  cram_block *b = cram_read_block(fd);
2658  cram_slice *s = calloc(1, sizeof(*s));
2659  int i, n, max_id, min_id;
2660 
2661  if (!b || !s)
2662  goto err;
2663 
2664  s->hdr_block = b;
2665  switch (b->content_type) {
2666  case MAPPED_SLICE:
2667  case UNMAPPED_SLICE:
2668  if (!(s->hdr = cram_decode_slice_header(fd, b)))
2669  goto err;
2670  break;
2671 
2672  default:
2673  fprintf(stderr, "Unexpected block of type %s\n",
2675  goto err;
2676  }
2677 
2678  s->block = calloc(n = s->hdr->num_blocks, sizeof(*s->block));
2679  if (!s->block)
2680  goto err;
2681 
2682  for (max_id = i = 0, min_id = INT_MAX; i < n; i++) {
2683  if (!(s->block[i] = cram_read_block(fd)))
2684  goto err;
2685 
2686  if (s->block[i]->content_type == EXTERNAL) {
2687  if (max_id < s->block[i]->content_id)
2688  max_id = s->block[i]->content_id;
2689  if (min_id > s->block[i]->content_id)
2690  min_id = s->block[i]->content_id;
2691  }
2692  }
2693  if (min_id >= 0 && max_id < 1024) {
2694  if (!(s->block_by_id = calloc(1024, sizeof(s->block[0]))))
2695  goto err;
2696 
2697  for (i = 0; i < n; i++) {
2698  if (s->block[i]->content_type != EXTERNAL)
2699  continue;
2700  s->block_by_id[s->block[i]->content_id] = s->block[i];
2701  }
2702  }
2703 
2704  /* Initialise encoding/decoding tables */
2705  s->cigar = NULL;
2706  s->cigar_alloc = 0;
2707  s->ncigar = 0;
2708 
2709  if (!(s->seqs_blk = cram_new_block(EXTERNAL, 0))) goto err;
2710  if (!(s->qual_blk = cram_new_block(EXTERNAL, CRAM_EXT_QUAL))) goto err;
2711  if (!(s->name_blk = cram_new_block(EXTERNAL, CRAM_EXT_NAME))) goto err;
2712  if (!(s->aux_blk = cram_new_block(EXTERNAL, CRAM_EXT_TAG))) goto err;
2713  if (!(s->base_blk = cram_new_block(EXTERNAL, CRAM_EXT_IN))) goto err;
2714  if (!(s->soft_blk = cram_new_block(EXTERNAL, CRAM_EXT_SC))) goto err;
2715 #ifdef TN_external
2716  if (!(s->tn_blk = cram_new_block(EXTERNAL, CRAM_EXT_TN))) goto err;
2717 #endif
2718 
2719 
2720  s->crecs = NULL;
2721 
2722  s->last_apos = s->hdr->ref_seq_start;
2723 
2724  s->id = fd->slice_num++;
2725 
2726  return s;
2727 
2728  err:
2729  if (b)
2730  cram_free_block(b);
2731  if (s) {
2732  s->hdr_block = NULL;
2733  cram_free_slice(s);
2734  }
2735  return NULL;
2736 }
2737 
2738 
2739 /* ----------------------------------------------------------------------
2740  * CRAM file definition (header)
2741  */
2742 
2743 /*
2744  * Reads a CRAM file definition structure.
2745  * Returns file_def ptr on success
2746  * NULL on failure
2747  */
2749  cram_file_def *def = malloc(sizeof(*def));
2750  if (!def)
2751  return NULL;
2752 
2753  if (26 != hread(fd->fp, &def->magic[0], 26)) {
2754  free(def);
2755  return NULL;
2756  }
2757 
2758  if (memcmp(def->magic, "CRAM", 4) != 0) {
2759  free(def);
2760  return NULL;
2761  }
2762 
2763  if (def->major_version > 2) {
2764  fprintf(stderr, "CRAM version number mismatch\n"
2765  "Expected 1.x or 2.x, got %d.%d\n",
2766  def->major_version, def->minor_version);
2767  free(def);
2768  return NULL;
2769  }
2770 
2771  fd->first_container += 26;
2772  fd->last_slice = 0;
2773 
2774  return def;
2775 }
2776 
2777 /*
2778  * Writes a cram_file_def structure to cram_fd.
2779  * Returns 0 on success
2780  * -1 on failure
2781  */
2783  return (hwrite(fd->fp, &def->magic[0], 26) == 26) ? 0 : -1;
2784 }
2785 
2787  if (def) free(def);
2788 }
2789 
2790 /* ----------------------------------------------------------------------
2791  * SAM header I/O
2792  */
2793 
2794 
2795 /*
2796  * Reads the SAM header from the first CRAM data block.
2797  * Also performs minimal parsing to extract read-group
2798  * and sample information.
2799 
2800  * Returns SAM hdr ptr on success
2801  * NULL on failure
2802  */
2804  int32_t header_len;
2805  char *header;
2806  SAM_hdr *hdr;
2807 
2808  /* 1.1 onwards stores the header in the first block of a container */
2809  if (fd->version == CRAM_1_VERS) {
2810  /* Length */
2811  if (-1 == int32_decode(fd, &header_len))
2812  return NULL;
2813 
2814  /* Alloc and read */
2815  if (NULL == (header = malloc(header_len+1)))
2816  return NULL;
2817 
2818  *header = 0;
2819  if (header_len != hread(fd->fp, header, header_len))
2820  return NULL;
2821 
2822  fd->first_container += 4 + header_len;
2823  } else {
2825  cram_block *b;
2826  int i, len;
2827 
2828  if (!c)
2829  return NULL;
2830 
2831  if (c->num_blocks < 1) {
2833  return NULL;
2834  }
2835 
2836  if (!(b = cram_read_block(fd))) {
2838  return NULL;
2839  }
2840 
2841  len = b->comp_size + 2 +
2842  itf8_size(b->content_id) +
2843  itf8_size(b->uncomp_size) +
2844  itf8_size(b->comp_size);
2845 
2846  /* Extract header from 1st block */
2847  if (-1 == int32_get(b, &header_len) ||
2848  b->uncomp_size - 4 < header_len) {
2850  cram_free_block(b);
2851  return NULL;
2852  }
2853  if (NULL == (header = malloc(header_len))) {
2855  cram_free_block(b);
2856  return NULL;
2857  }
2858  memcpy(header, BLOCK_END(b), header_len);
2859  cram_free_block(b);
2860 
2861  /* Consume any remaining blocks */
2862  for (i = 1; i < c->num_blocks; i++) {
2863  if (!(b = cram_read_block(fd))) {
2865  return NULL;
2866  }
2867  len += b->comp_size + 2 +
2868  itf8_size(b->content_id) +
2869  itf8_size(b->uncomp_size) +
2870  itf8_size(b->comp_size);
2871  cram_free_block(b);
2872  }
2873 
2874  if (c->length && c->length > len) {
2875  // Consume padding
2876  char *pads = malloc(c->length - len);
2877  if (!pads) {
2879  return NULL;
2880  }
2881 
2882  if (c->length - len != hread(fd->fp, pads, c->length - len)) {
2884  return NULL;
2885  }
2886  free(pads);
2887  }
2888 
2890  }
2891 
2892  /* Parse */
2893 #ifdef SAMTOOLS
2894  hdr = sam_hdr_parse_(header, header_len);
2895 #else
2896  hdr = sam_hdr_parse(header, header_len);
2897 #endif
2898  free(header);
2899 
2900  return hdr;
2901 }
2902 
2903 /*
2904  * Converts 'in' to a full pathname to store in out.
2905  * Out must be at least PATH_MAX bytes long.
2906  */
2907 static void full_path(char *out, char *in) {
2908  if (*in == '/') {
2909  strncpy(out, in, PATH_MAX);
2910  out[PATH_MAX-1] = 0;
2911  } else {
2912  int len;
2913 
2914  // unable to get dir or out+in is too long
2915  if (!getcwd(out, PATH_MAX) ||
2916  (len = strlen(out))+1+strlen(in) >= PATH_MAX) {
2917  strncpy(out, in, PATH_MAX);
2918  out[PATH_MAX-1] = 0;
2919  return;
2920  }
2921 
2922  sprintf(out+len, "/%.*s", PATH_MAX - len, in);
2923 
2924  // FIXME: cope with `pwd`/../../../foo.fa ?
2925  }
2926 }
2927 
2928 /*
2929  * Writes a CRAM SAM header.
2930  * Returns 0 on success
2931  * -1 on failure
2932  */
2933 //#define BLANK_BLOCK
2934 //#define PADDED_CONTAINER
2935 #define PADDED_BLOCK
2937  int header_len;
2938 
2939  /* 1.0 requires and UNKNOWN read-group */
2940  if (fd->version == CRAM_1_VERS) {
2941  if (!sam_hdr_find_rg(hdr, "UNKNOWN"))
2942  if (sam_hdr_add(hdr, "RG",
2943  "ID", "UNKNOWN", "SM", "UNKNOWN", NULL))
2944  return -1;
2945  }
2946 
2947  /* Fix M5 strings */
2948  if (fd->refs && !fd->no_ref) {
2949  int i;
2950  for (i = 0; i < hdr->nref; i++) {
2951  SAM_hdr_type *ty;
2952  char *ref;
2953 
2954  if (!(ty = sam_hdr_find(hdr, "SQ", "SN", hdr->ref[i].name)))
2955  return -1;
2956 
2957  if (!sam_hdr_find_key(hdr, ty, "M5", NULL)) {
2958  char unsigned buf[16], buf2[33];
2959  int j, rlen;
2960  MD5_CTX md5;
2961 
2962  if (!fd->refs ||
2963  !fd->refs->ref_id ||
2964  !fd->refs->ref_id[i]) {
2965  return -1;
2966  }
2967  rlen = fd->refs->ref_id[i]->length;
2968  MD5_Init(&md5);
2969  ref = cram_get_ref(fd, i, 1, rlen);
2970  if (NULL == ref) return -1;
2971  rlen = fd->refs->ref_id[i]->length; /* In case it just loaded */
2972  MD5_Update(&md5, ref, rlen);
2973  MD5_Final(buf, &md5);
2974  cram_ref_decr(fd->refs, i);
2975 
2976  for (j = 0; j < 16; j++) {
2977  buf2[j*2+0] = "0123456789abcdef"[buf[j]>>4];
2978  buf2[j*2+1] = "0123456789abcdef"[buf[j]&15];
2979  }
2980  buf2[32] = 0;
2981  if (sam_hdr_update(hdr, ty, "M5", buf2, NULL))
2982  return -1;
2983  }
2984 
2985  if (fd->ref_fn) {
2986  char ref_fn[PATH_MAX];
2987  full_path(ref_fn, fd->ref_fn);
2988  if (sam_hdr_update(hdr, ty, "UR", ref_fn, NULL))
2989  return -1;
2990  }
2991  }
2992  }
2993 
2994  if (sam_hdr_rebuild(hdr))
2995  return -1;
2996 
2997  /* Length */
2998  header_len = sam_hdr_length(hdr);
2999  if (fd->version == CRAM_1_VERS) {
3000  if (-1 == int32_encode(fd, header_len))
3001  return -1;
3002 
3003  /* Text data */
3004  if (header_len != hwrite(fd->fp, sam_hdr_str(hdr), header_len))
3005  return -1;
3006  } else {
3007  /* Create a block inside a container */
3010  int padded_length;
3011  char *pads;
3012 
3013  if (!b || !c) {
3014  if (b) cram_free_block(b);
3015  if (c) cram_free_container(c);
3016  return -1;
3017  }
3018 
3019  int32_put(b, header_len);
3020  BLOCK_APPEND(b, sam_hdr_str(hdr), header_len);
3021  BLOCK_UPLEN(b);
3022 
3023 #ifndef BLANK_BLOCK
3024  c->num_blocks = 1;
3025  c->num_landmarks = 1;
3026  if (!(c->landmark = malloc(sizeof(*c->landmark)))) {
3027  cram_free_block(b);
3029  return -1;
3030  }
3031  c->landmark[0] = 0;
3032 
3033  c->length = b->uncomp_size + 2 +
3034  itf8_size(b->content_id) +
3035  itf8_size(b->uncomp_size) +
3036  itf8_size(b->comp_size);
3037 #else
3038  c->length = b->uncomp_size + 2 +
3039  itf8_size(b->content_id) +
3040  itf8_size(b->uncomp_size) +
3041  itf8_size(b->comp_size);
3042 
3043  c->num_blocks = 2;
3044  c->num_landmarks = 2;
3045  if (!(c->landmark = malloc(2*sizeof(*c->landmark))))
3046  return -1;
3047  c->landmark[0] = 0;
3048  c->landmark[1] = c->length;
3049 
3050  c->length *= 2;
3051 #endif
3052 
3053 #ifdef PADDED_BLOCK
3054  padded_length = MAX(c->length*1.5, 10000) - c->length;
3055  c->length += padded_length;
3056  if (NULL == (pads = calloc(1, padded_length))) {
3057  cram_free_block(b);
3059  return -1;
3060  }
3061  BLOCK_APPEND(b, pads, padded_length);
3062  BLOCK_UPLEN(b);
3063  free(pads);
3064 #endif
3065 
3066 #ifdef PADDED_CONTAINER
3067  padded_length = MAX(c->length*2, 10000) - c->length;
3068  c->length += padded_length;
3069 #endif
3070 
3071  if (-1 == cram_write_container(fd, c)) {
3072  cram_free_block(b);
3074  return -1;
3075  }
3076 
3077  // Keep it uncompressed
3078  if (-1 == cram_write_block(fd, b)) {
3079  cram_free_block(b);
3081  return -1;
3082  }
3083 
3084 #ifdef BLANK_BLOCK
3085  if (-1 == cram_write_block(fd, b)) {
3086  cram_free_block(b);
3088  return -1;
3089  }
3090 #endif
3091 
3092  cram_free_block(b);
3094 
3095 #ifdef PADDED_CONTAINER
3096  // Write out padding to allow for in-line SAM header editing
3097  if (NULL == (pads = calloc(1, padded_length)))
3098  return -1;
3099  if (padded_length != hwrite(fd->fp, pads, padded_length))
3100  return -1;
3101  free(pads);
3102 #endif
3103  }
3104 
3105  if (-1 == refs_from_header(fd->refs, fd, fd->header))
3106  return -1;
3107  if (-1 == refs2id(fd->refs, fd->header))
3108  return -1;
3109 
3110  if (0 != hflush(fd->fp))
3111  return -1;
3112 
3113  RP("=== Finishing saving header ===\n");
3114 
3115  return 0;
3116 }
3117 
3118 /* ----------------------------------------------------------------------
3119  * The top-level cram opening, closing and option handling
3120  */
3121 
3122 /*
3123  * Initialises the lookup tables. These could be global statics, but they're
3124  * clumsy to setup in a multi-threaded environment unless we generate
3125  * verbatim code and include that.
3126  */
3127 static void cram_init_tables(cram_fd *fd) {
3128  int i;
3129 
3130  memset(fd->L1, 4, 256);
3131  fd->L1['A'] = 0; fd->L1['a'] = 0;
3132  fd->L1['C'] = 1; fd->L1['c'] = 1;
3133  fd->L1['G'] = 2; fd->L1['g'] = 2;
3134  fd->L1['T'] = 3; fd->L1['t'] = 3;
3135 
3136  memset(fd->L2, 5, 256);
3137  fd->L2['A'] = 0; fd->L2['a'] = 0;
3138  fd->L2['C'] = 1; fd->L2['c'] = 1;
3139  fd->L2['G'] = 2; fd->L2['g'] = 2;
3140  fd->L2['T'] = 3; fd->L2['t'] = 3;
3141  fd->L2['N'] = 4; fd->L2['n'] = 4;
3142 
3143  if (fd->version == CRAM_1_VERS) {
3144  for (i = 0; i < 0x200; i++) {
3145  int f = 0;
3146 
3147  if (i & CRAM_FPAIRED) f |= BAM_FPAIRED;
3148  if (i & CRAM_FPROPER_PAIR) f |= BAM_FPROPER_PAIR;
3149  if (i & CRAM_FUNMAP) f |= BAM_FUNMAP;
3150  if (i & CRAM_FREVERSE) f |= BAM_FREVERSE;
3151  if (i & CRAM_FREAD1) f |= BAM_FREAD1;
3152  if (i & CRAM_FREAD2) f |= BAM_FREAD2;
3153  if (i & CRAM_FSECONDARY) f |= BAM_FSECONDARY;
3154  if (i & CRAM_FQCFAIL) f |= BAM_FQCFAIL;
3155  if (i & CRAM_FDUP) f |= BAM_FDUP;
3156 
3157  fd->bam_flag_swap[i] = f;
3158  }
3159 
3160  for (i = 0; i < 0x1000; i++) {
3161  int g = 0;
3162 
3163  if (i & BAM_FPAIRED) g |= CRAM_FPAIRED;
3164  if (i & BAM_FPROPER_PAIR) g |= CRAM_FPROPER_PAIR;
3165  if (i & BAM_FUNMAP) g |= CRAM_FUNMAP;
3166  if (i & BAM_FREVERSE) g |= CRAM_FREVERSE;
3167  if (i & BAM_FREAD1) g |= CRAM_FREAD1;
3168  if (i & BAM_FREAD2) g |= CRAM_FREAD2;
3169  if (i & BAM_FSECONDARY) g |= CRAM_FSECONDARY;
3170  if (i & BAM_FQCFAIL) g |= CRAM_FQCFAIL;
3171  if (i & BAM_FDUP) g |= CRAM_FDUP;
3172 
3173  fd->cram_flag_swap[i] = g;
3174  }
3175  } else {
3176  /* NOP */
3177  for (i = 0; i < 0x1000; i++)
3178  fd->bam_flag_swap[i] = i;
3179  for (i = 0; i < 0x1000; i++)
3180  fd->cram_flag_swap[i] = i;
3181  }
3182 
3183  memset(fd->cram_sub_matrix, 4, 32*32);
3184  for (i = 0; i < 32; i++) {
3185  fd->cram_sub_matrix[i]['A'&0x1f]=0;
3186  fd->cram_sub_matrix[i]['C'&0x1f]=1;
3187  fd->cram_sub_matrix[i]['G'&0x1f]=2;
3188  fd->cram_sub_matrix[i]['T'&0x1f]=3;
3189  fd->cram_sub_matrix[i]['N'&0x1f]=4;
3190  }
3191  for (i = 0; i < 20; i+=4) {
3192  int j;
3193  for (j = 0; j < 20; j++) {
3194  fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
3195  fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
3196  fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
3197  fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
3198  }
3199  fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+0]&0x1f]=0;
3200  fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+1]&0x1f]=1;
3201  fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+2]&0x1f]=2;
3202  fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+3]&0x1f]=3;
3203  }
3204 }
3205 
3206 // Default version numbers for CRAM
3207 static int major_version = 2;
3208 static int minor_version = 1;
3209 
3210 /*
3211  * Opens a CRAM file for read (mode "rb") or write ("wb").
3212  * The filename may be "-" to indicate stdin or stdout.
3213  *
3214  * Returns file handle on success
3215  * NULL on failure.
3216  */
3217 cram_fd *cram_open(const char *filename, const char *mode) {
3218  cram_FILE *fp;
3219  cram_fd *fd;
3220  char fmode[3]= { mode[0], '\0', '\0' };
3221 
3222  if (strlen(mode) > 1 && (mode[1] == 'b' || mode[1] == 'c')) {
3223  fmode[1] = 'b';
3224  }
3225 
3226 #ifdef SAMTOOLS
3227  fp = hopen(filename, fmode);
3228 #else
3229  if (strcmp(filename, "-") == 0) {
3230  fp = (*fmode == 'r') ? stdin : stdout;
3231  } else {
3232  fp = fopen(filename, fmode);
3233  }
3234 #endif
3235  if (!fp)
3236  return NULL;
3237 
3238  fd = cram_dopen(fp, filename, mode);
3239  if (!fd)
3240  hclose_abruptly(fp);
3241 
3242  return fd;
3243 }
3244 
3245 /* Opens an existing stream for reading or writing.
3246  *
3247  * Returns file handle on success;
3248  * NULL on failure.
3249  *
3250  * cram_FILE is either htslib's hFILE or stdio's FILE, depending on how
3251  * cram_structs.h has been configured.
3252  */
3253 cram_fd *cram_dopen(cram_FILE *fp, const char *filename, const char *mode) {
3254  int i;
3255  char *cp;
3256  cram_fd *fd = calloc(1, sizeof(*fd));
3257  if (!fd)
3258  return NULL;
3259 
3260  fd->level = 5;
3261  if (strlen(mode) > 2 && mode[2] >= '0' && mode[2] <= '9')
3262  fd->level = mode[2] - '0';
3263 
3264  fd->fp = fp;
3265  fd->mode = *mode;
3266  fd->first_container = 0;
3267 
3268  if (fd->mode == 'r') {
3269  /* Reader */
3270 
3271  if (!(fd->file_def = cram_read_file_def(fd)))
3272  goto err;
3273 
3274  fd->version = fd->file_def->major_version * 100 +
3275  fd->file_def->minor_version;
3276 
3277  if (!(fd->header = cram_read_SAM_hdr(fd)))
3278  goto err;
3279 
3280  } else {
3281  /* Writer */
3282  cram_file_def def;
3283 
3284  def.magic[0] = 'C';
3285  def.magic[1] = 'R';
3286  def.magic[2] = 'A';
3287  def.magic[3] = 'M';
3288  def.major_version = major_version;
3289  def.minor_version = minor_version;
3290  memset(def.file_id, 0, 20);
3291  strncpy(def.file_id, filename, 20);
3292  if (0 != cram_write_file_def(fd, &def))
3293  goto err;
3294 
3295  fd->version = def.major_version * 100 + def.minor_version;
3296 
3297  /* SAM header written later */
3298  }
3299 
3300  cram_init_tables(fd);
3301 
3302  fd->prefix = strdup((cp = strrchr(filename, '/')) ? cp+1 : filename);
3303  if (!fd->prefix)
3304  goto err;
3305  fd->slice_num = 0;
3306  fd->first_base = fd->last_base = -1;
3307  fd->record_counter = 0;
3308 
3309  fd->ctr = NULL;
3310  fd->refs = refs_create();
3311  if (!fd->refs)
3312  goto err;
3313  fd->ref_id = -2;
3314  fd->ref = NULL;
3315 
3316  fd->decode_md = 0;
3317  fd->verbose = 0;
3320  fd->embed_ref = 0;
3321  fd->no_ref = 0;
3322  fd->ignore_md5 = 0;
3323  fd->use_bz2 = 0;
3324  fd->multi_seq = 0;
3325  fd->unsorted = 0;
3326  fd->shared_ref = 0;
3327 
3328  fd->index = NULL;
3329  fd->own_pool = 0;
3330  fd->pool = NULL;
3331  fd->rqueue = NULL;
3332  fd->job_pending = NULL;
3333  fd->ooc = 0;
3334 
3335  for (i = 0; i < 7; i++)
3336  fd->m[i] = cram_new_metrics();
3337 
3338  fd->range.refid = -2; // no ref.
3339  fd->eof = 1; // See samtools issue #150
3340  fd->ref_fn = NULL;
3341 
3342  fd->bl = NULL;
3343 
3344  /* Initialise dummy refs from the @SQ headers */
3345  if (-1 == refs_from_header(fd->refs, fd, fd->header))
3346  goto err;
3347 
3348  return fd;
3349 
3350  err:
3351  if (fd)
3352  free(fd);
3353 
3354  return NULL;
3355 }
3356 
3357 /*
3358  * Seek within a CRAM file.
3359  *
3360  * Returns 0 on success
3361  * -1 on failure
3362  */
3363 int cram_seek(cram_fd *fd, off_t offset, int whence) {
3364  char buf[65536];
3365 
3366  if (hseek(fd->fp, offset, whence) >= 0)
3367  return 0;
3368 
3369  if (!(whence == SEEK_CUR && offset >= 0))
3370  return -1;
3371 
3372  /* Couldn't fseek, but we're in SEEK_CUR mode so read instead */
3373  while (offset > 0) {
3374  int len = MIN(65536, offset);
3375  if (len != hread(fd->fp, buf, len))
3376  return -1;
3377  offset -= len;
3378  }
3379 
3380  return 0;
3381 }
3382 
3383 /*
3384  * Flushes a CRAM file.
3385  * Useful for when writing to stdout without wishing to close the stream.
3386  *
3387  * Returns 0 on success
3388  * -1 on failure
3389  */
3391  if (!fd)
3392  return -1;
3393 
3394  if (fd->mode == 'w' && fd->ctr) {
3395  if(fd->ctr->slice)
3396  fd->ctr->curr_slice++;
3397  if (-1 == cram_flush_container_mt(fd, fd->ctr))
3398  return -1;
3399  }
3400 
3401  return 0;
3402 }
3403 
3404 /*
3405  * Closes a CRAM file.
3406  * Returns 0 on success
3407  * -1 on failure
3408  */
3410  spare_bams *bl, *next;
3411  int i;
3412 
3413  if (!fd)
3414  return -1;
3415 
3416  if (fd->mode == 'w' && fd->ctr) {
3417  if(fd->ctr->slice)
3418  fd->ctr->curr_slice++;
3419  if (-1 == cram_flush_container_mt(fd, fd->ctr))
3420  return -1;
3421  }
3422 
3423  if (fd->pool) {
3424  t_pool_flush(fd->pool);
3425 
3426  if (0 != cram_flush_result(fd))
3427  return -1;
3428 
3429  pthread_mutex_destroy(&fd->metrics_lock);
3430  pthread_mutex_destroy(&fd->ref_lock);
3431  pthread_mutex_destroy(&fd->bam_list_lock);
3432 
3433  fd->ctr = NULL; // prevent double freeing
3434 
3435  //fprintf(stderr, "CRAM: destroy queue %p\n", fd->rqueue);
3436 
3438  }
3439 
3440  if (fd->mode == 'w') {
3441  /* Write EOF block */
3442  if (30 != hwrite(fd->fp, "\x0b\x00\x00\x00\xff\xff\xff\xff"
3443  "\xff\xe0\x45\x4f\x46\x00\x00\x00"
3444  "\x00\x01\x00\x00\x01\x00\x06\x06"
3445  "\x01\x00\x01\x00\x01\x00", 30))
3446  return -1;
3447 
3448 // if (1 != fwrite("\x00\x00\x00\x00\xff\xff\xff\xff"
3449 // "\xff\xe0\x45\x4f\x46\x00\x00\x00"
3450 // "\x00\x00\x00", 19, 1, fd->fp))
3451 // return -1;
3452  }
3453 
3454  for (bl = fd->bl; bl; bl = next) {
3455  int i, max_rec = fd->seqs_per_slice * fd->slices_per_container;
3456 
3457  next = bl->next;
3458  for (i = 0; i < max_rec; i++) {
3459  if (bl->bams[i])
3460  bam_free(bl->bams[i]);
3461  }
3462  free(bl->bams);
3463  free(bl);
3464  }
3465 
3466  if (paranoid_hclose(fd->fp) != 0)
3467  return -1;
3468 
3469  if (fd->file_def)
3471 
3472  if (fd->header)
3473  sam_hdr_free(fd->header);
3474 
3475  free(fd->prefix);
3476 
3477  if (fd->ctr)
3478  cram_free_container(fd->ctr);
3479 
3480  if (fd->refs)
3481  refs_free(fd->refs);
3482  if (fd->ref_free)
3483  free(fd->ref_free);
3484 
3485  for (i = 0; i < 7; i++)
3486  if (fd->m[i])
3487  free(fd->m[i]);
3488 
3489  if (fd->index)
3490  cram_index_free(fd);
3491 
3492  if (fd->own_pool && fd->pool)
3493  t_pool_destroy(fd->pool, 0);
3494 
3495  free(fd);
3496  return 0;
3497 }
3498 
3499 /*
3500  * Returns 1 if we hit an EOF while reading.
3501  */
3502 int cram_eof(cram_fd *fd) {
3503  return fd->eof;
3504 }
3505 
3506 
3507 /*
3508  * Sets options on the cram_fd. See CRAM_OPT_* definitions in cram_structs.h.
3509  * Use this immediately after opening.
3510  *
3511  * Returns 0 on success
3512  * -1 on failure
3513  */
3514 int cram_set_option(cram_fd *fd, enum cram_option opt, ...) {
3515  int r;
3516  va_list args;
3517 
3518  va_start(args, opt);
3519  r = cram_set_voption(fd, opt, args);
3520  va_end(args);
3521 
3522  return r;
3523 }
3524 
3525 /*
3526  * Sets options on the cram_fd. See CRAM_OPT_* definitions in cram_structs.h.
3527  * Use this immediately after opening.
3528  *
3529  * Returns 0 on success
3530  * -1 on failure
3531  */
3532 int cram_set_voption(cram_fd *fd, enum cram_option opt, va_list args) {
3533  refs_t *refs;
3534 
3535  switch (opt) {
3536  case CRAM_OPT_DECODE_MD:
3537  fd->decode_md = va_arg(args, int);
3538  break;
3539 
3540  case CRAM_OPT_PREFIX:
3541  if (fd->prefix)
3542  free(fd->prefix);
3543  if (!(fd->prefix = strdup(va_arg(args, char *))))
3544  return -1;
3545  break;
3546 
3547  case CRAM_OPT_VERBOSITY:
3548  fd->verbose = va_arg(args, int);
3549  break;
3550 
3552  fd->seqs_per_slice = va_arg(args, int);
3553  break;
3554 
3556  fd->slices_per_container = va_arg(args, int);
3557  break;
3558 
3559  case CRAM_OPT_EMBED_REF:
3560  fd->embed_ref = va_arg(args, int);
3561  break;
3562 
3563  case CRAM_OPT_NO_REF:
3564  fd->no_ref = va_arg(args, int);
3565  break;
3566 
3567  case CRAM_OPT_IGNORE_MD5:
3568  fd->ignore_md5 = va_arg(args, int);
3569  break;
3570 
3571  case CRAM_OPT_USE_BZIP2:
3572  fd->use_bz2 = va_arg(args, int);
3573  break;
3574 
3575  case CRAM_OPT_SHARED_REF:
3576  fd->shared_ref = 1;
3577  refs = va_arg(args, refs_t *);
3578  if (refs != fd->refs) {
3579  if (fd->refs)
3580  refs_free(fd->refs);
3581  fd->refs = refs;
3582  fd->refs->count++;
3583  }
3584  break;
3585 
3586  case CRAM_OPT_RANGE:
3587  fd->range = *va_arg(args, cram_range *);
3588  return cram_seek_to_refpos(fd, &fd->range);
3589 
3590  case CRAM_OPT_REFERENCE:
3591  return cram_load_reference(fd, va_arg(args, char *));
3592 
3593  case CRAM_OPT_VERSION: {
3594  int major, minor;
3595  char *s = va_arg(args, char *);
3596  if (2 != sscanf(s, "%d.%d", &major, &minor)) {
3597  fprintf(stderr, "Malformed version string %s\n", s);
3598  return -1;
3599  }
3600  if (!((major == 1 && minor == 0) ||
3601  (major == 2 && (minor == 0 || minor == 1)) ||
3602  (major == 3 && minor == 0))) {
3603  fprintf(stderr, "Unknown version string; "
3604  "use 1.0, 2.0, 2.1 or 3.0\n");
3605  return -1;
3606  }
3607  break;
3608  }
3609 
3611  fd->multi_seq = va_arg(args, int);
3612  break;
3613 
3614  case CRAM_OPT_NTHREADS: {
3615  int nthreads = va_arg(args, int);
3616  if (nthreads > 1) {
3617  if (!(fd->pool = t_pool_init(nthreads*2, nthreads)))
3618  return -1;
3619 
3620  fd->rqueue = t_results_queue_init();
3621  pthread_mutex_init(&fd->metrics_lock, NULL);
3622  pthread_mutex_init(&fd->ref_lock, NULL);
3623  pthread_mutex_init(&fd->bam_list_lock, NULL);
3624  fd->shared_ref = 1;
3625  fd->own_pool = 1;
3626  }
3627  break;
3628  }
3629 
3630  case CRAM_OPT_THREAD_POOL:
3631  fd->pool = va_arg(args, t_pool *);
3632  if (fd->pool) {
3633  fd->rqueue = t_results_queue_init();
3634  pthread_mutex_init(&fd->metrics_lock, NULL);
3635  pthread_mutex_init(&fd->ref_lock, NULL);
3636  pthread_mutex_init(&fd->bam_list_lock, NULL);
3637  }
3638  fd->shared_ref = 1; // Needed to avoid clobbering ref between threads
3639  fd->own_pool = 0;
3640 
3641  //fd->qsize = 1;
3642  //fd->decoded = calloc(fd->qsize, sizeof(cram_container *));
3643  //t_pool_dispatch(fd->pool, cram_decoder_thread, fd);
3644  break;
3645 
3646  default:
3647  fprintf(stderr, "Unknown CRAM option code %d\n", opt);
3648  return -1;
3649  }
3650 
3651  return 0;
3652 }