NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
cram_decode.c
Go to the documentation of this file.
1 /*
2 Copyright (c) 2012-2013 Genome Research Ltd.
3 Author: James Bonfield <jkb@sanger.ac.uk>
4 
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
7 
8  1. Redistributions of source code must retain the above copyright notice,
9 this list of conditions and the following disclaimer.
10 
11  2. Redistributions in binary form must reproduce the above copyright notice,
12 this list of conditions and the following disclaimer in the documentation
13 and/or other materials provided with the distribution.
14 
15  3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16 Institute nor the names of its contributors may be used to endorse or promote
17 products derived from this software without specific prior written permission.
18 
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30 
31 /*
32  * - In-memory decoding of CRAM data structures.
33  * - Iterator for reading CRAM record by record.
34  */
35 
36 #ifdef HAVE_CONFIG_H
37 #include "io_lib_config.h"
38 #endif
39 
40 #include <stdio.h>
41 #include <errno.h>
42 #include <assert.h>
43 #include <stdlib.h>
44 #include <string.h>
45 #include <zlib.h>
46 #include <sys/types.h>
47 #include <sys/stat.h>
48 #include <math.h>
49 #include <ctype.h>
50 
51 #include "cram/cram.h"
52 #include "cram/os.h"
53 #include "cram/md5.h"
54 
55 //Whether CIGAR has just M or uses = and X to indicate match and mismatch
56 //#define USE_X
57 
58 /* ----------------------------------------------------------------------
59  * CRAM compression headers
60  */
61 
62 /*
63  * Decodes the Tag Dictionary record in the preservation map
64  * Updates the cram compression header.
65  *
66  * Returns number of bytes decoded on success
67  * -1 on failure
68  */
70  char *op = cp;
71  unsigned char *dat;
72  cram_block *b;
73  int32_t blk_size;
74  int nTL, i, sz;
75 
76  if (!(b = cram_new_block(0, 0)))
77  return -1;
78  h->TD_blk = b;
79 
80  /* Decode */
81  cp += itf8_get(cp, &blk_size);
82  if (!blk_size) {
83  h->nTL = 0;
84  h->TL = NULL;
85  cram_free_block(b);
86  return cp - op;
87  }
88 
89  BLOCK_APPEND(b, cp, blk_size);
90  cp += blk_size;
91  sz = cp - op;
92 
93  // Force nul termination if missing
94  if (BLOCK_DATA(b)[BLOCK_SIZE(b)-1])
95  BLOCK_APPEND_CHAR(b, '\0');
96 
97  /* Set up TL lookup table */
98  dat = BLOCK_DATA(b);
99 
100  // Count
101  for (nTL = i = 0; i < BLOCK_SIZE(b); i++) {
102  nTL++;
103  while (dat[i])
104  i++;
105  }
106 
107  // Copy
108  h->nTL = nTL;
109  if (!(h->TL = calloc(h->nTL, sizeof(unsigned char *))))
110  return -1;
111  for (nTL = i = 0; i < BLOCK_SIZE(b); i++) {
112  h->TL[nTL++] = &dat[i];
113  while (dat[i])
114  i++;
115  }
116 
117  return sz;
118 }
119 
120 /*
121  * Decodes a CRAM block compression header.
122  * Returns header ptr on success
123  * NULL on failure
124  */
126  cram_block *b) {
127  char *cp, *cp_copy;
128  cram_block_compression_hdr *hdr = calloc(1, sizeof(*hdr));
129  int i;
130  int32_t map_size, map_count;
131 
132  if (!hdr)
133  return NULL;
134 
135  if (b->method != RAW) {
136  if (cram_uncompress_block(b))
137  return NULL;
138  }
139 
140  cp = (char *)b->data;
141 
142  if (fd->version == CRAM_1_VERS) {
143  cp += itf8_get(cp, &hdr->ref_seq_id);
144  cp += itf8_get(cp, &hdr->ref_seq_start);
145  cp += itf8_get(cp, &hdr->ref_seq_span);
146  cp += itf8_get(cp, &hdr->num_records);
147  cp += itf8_get(cp, &hdr->num_landmarks);
148  if (!(hdr->landmark = malloc(hdr->num_landmarks * sizeof(int32_t)))) {
149  free(hdr);
150  return NULL;
151  }
152  for (i = 0; i < hdr->num_landmarks; i++) {
153  cp += itf8_get(cp, &hdr->landmark[i]);
154  }
155  }
156 
157  hdr->preservation_map = kh_init(map);
158 
159  memset(hdr->rec_encoding_map, 0,
160  CRAM_MAP_HASH * sizeof(hdr->rec_encoding_map[0]));
161  memset(hdr->tag_encoding_map, 0,
162  CRAM_MAP_HASH * sizeof(hdr->tag_encoding_map[0]));
163 
164  if (!hdr->preservation_map) {
166  return NULL;
167  }
168 
169  /* Initialise defaults for preservation map */
170  hdr->mapped_qs_included = 0;
171  hdr->unmapped_qs_included = 0;
172  hdr->unmapped_placed = 0;
173  hdr->qs_included = 0;
174  hdr->read_names_included = 0;
175  hdr->AP_delta = 1;
176  memcpy(hdr->substitution_matrix, "CGTNAGTNACTNACGNACGT", 20);
177 
178  /* Preservation map */
179  cp += itf8_get(cp, &map_size); cp_copy = cp;
180  cp += itf8_get(cp, &map_count);
181  for (i = 0; i < map_count; i++) {
182  pmap_t hd;
183  khint_t k;
184  int r;
185 
186  cp += 2;
187  switch(CRAM_KEY(cp[-2],cp[-1])) {
188  case CRAM_KEY('M','I'):
189  hd.i = *cp++;
190  k = kh_put(map, hdr->preservation_map, "MI", &r);
191  if (-1 == r) {
193  return NULL;
194  }
195 
196  kh_val(hdr->preservation_map, k) = hd;
197  hdr->mapped_qs_included = hd.i;
198  break;
199 
200  case CRAM_KEY('U','I'):
201  hd.i = *cp++;
202  k = kh_put(map, hdr->preservation_map, "UI", &r);
203  if (-1 == r) {
205  return NULL;
206  }
207 
208  kh_val(hdr->preservation_map, k) = hd;
209  hdr->unmapped_qs_included = hd.i;
210  break;
211 
212  case CRAM_KEY('P','I'):
213  hd.i = *cp++;
214  k = kh_put(map, hdr->preservation_map, "PI", &r);
215  if (-1 == r) {
217  return NULL;
218  }
219 
220  kh_val(hdr->preservation_map, k) = hd;
221  hdr->unmapped_placed = hd.i;
222  break;
223 
224  case CRAM_KEY('R','N'):
225  hd.i = *cp++;
226  k = kh_put(map, hdr->preservation_map, "RN", &r);
227  if (-1 == r) {
229  return NULL;
230  }
231 
232  kh_val(hdr->preservation_map, k) = hd;
233  hdr->read_names_included = hd.i;
234  break;
235 
236  case CRAM_KEY('A','P'):
237  hd.i = *cp++;
238  k = kh_put(map, hdr->preservation_map, "AP", &r);
239  if (-1 == r) {
241  return NULL;
242  }
243 
244  kh_val(hdr->preservation_map, k) = hd;
245  hdr->AP_delta = hd.i;
246  break;
247 
248  case CRAM_KEY('R','R'):
249  hd.i = *cp++;
250  k = kh_put(map, hdr->preservation_map, "RR", &r);
251  if (-1 == r) {
253  return NULL;
254  }
255 
256  kh_val(hdr->preservation_map, k) = hd;
257  fd->no_ref = !hd.i;
258  break;
259 
260  case CRAM_KEY('S','M'):
261  hdr->substitution_matrix[0][(cp[0]>>6)&3] = 'C';
262  hdr->substitution_matrix[0][(cp[0]>>4)&3] = 'G';
263  hdr->substitution_matrix[0][(cp[0]>>2)&3] = 'T';
264  hdr->substitution_matrix[0][(cp[0]>>0)&3] = 'N';
265 
266  hdr->substitution_matrix[1][(cp[1]>>6)&3] = 'A';
267  hdr->substitution_matrix[1][(cp[1]>>4)&3] = 'G';
268  hdr->substitution_matrix[1][(cp[1]>>2)&3] = 'T';
269  hdr->substitution_matrix[1][(cp[1]>>0)&3] = 'N';
270 
271  hdr->substitution_matrix[2][(cp[2]>>6)&3] = 'A';
272  hdr->substitution_matrix[2][(cp[2]>>4)&3] = 'C';
273  hdr->substitution_matrix[2][(cp[2]>>2)&3] = 'T';
274  hdr->substitution_matrix[2][(cp[2]>>0)&3] = 'N';
275 
276  hdr->substitution_matrix[3][(cp[3]>>6)&3] = 'A';
277  hdr->substitution_matrix[3][(cp[3]>>4)&3] = 'C';
278  hdr->substitution_matrix[3][(cp[3]>>2)&3] = 'G';
279  hdr->substitution_matrix[3][(cp[3]>>0)&3] = 'N';
280 
281  hdr->substitution_matrix[4][(cp[4]>>6)&3] = 'A';
282  hdr->substitution_matrix[4][(cp[4]>>4)&3] = 'C';
283  hdr->substitution_matrix[4][(cp[4]>>2)&3] = 'G';
284  hdr->substitution_matrix[4][(cp[4]>>0)&3] = 'T';
285 
286  hd.p = cp;
287  cp += 5;
288 
289  k = kh_put(map, hdr->preservation_map, "SM", &r);
290  if (-1 == r) {
292  return NULL;
293  }
294  kh_val(hdr->preservation_map, k) = hd;
295  break;
296 
297  case CRAM_KEY('T','D'): {
298  int sz = cram_decode_TD(cp, hdr); // tag dictionary
299  if (sz < 0) {
301  return NULL;
302  }
303 
304  hd.p = cp;
305  cp += sz;
306 
307  k = kh_put(map, hdr->preservation_map, "TD", &r);
308  if (-1 == r) {
310  return NULL;
311  }
312  kh_val(hdr->preservation_map, k) = hd;
313  break;
314  }
315 
316  default:
317  fprintf(stderr, "Unrecognised preservation map key %c%c\n",
318  cp[-2], cp[-1]);
319  // guess byte;
320  cp++;
321  break;
322  }
323  }
324  if (cp - cp_copy != map_size) {
326  return NULL;
327  }
328 
329  /* Record encoding map */
330  cp += itf8_get(cp, &map_size); cp_copy = cp;
331  cp += itf8_get(cp, &map_count);
332  for (i = 0; i < map_count; i++) {
333  char *key = cp;
334  int32_t encoding;
335  int32_t size;
336  cram_map *m = malloc(sizeof(*m)); // FIXME: use pooled_alloc
337 
338  if (!m) {
340  return NULL;
341  }
342 
343  cp += 2;
344  cp += itf8_get(cp, &encoding);
345  cp += itf8_get(cp, &size);
346 
347  // Fill out cram_map purely for cram_dump to dump out.
348  m->key = (key[0]<<8)|key[1];
349  m->encoding = encoding;
350  m->size = size;
351  m->offset = cp - (char *)b->data;
352  m->codec = NULL;
353 
354  if (m->encoding == E_NULL)
355  continue;
356 
357  //printf("%s codes for %.2s\n", cram_encoding2str(encoding), key);
358 
359  /*
360  * For CRAM1.0 CF and BF are Byte and not Int.
361  * Practically speaking it makes no difference unless we have a
362  * 1.0 format file that stores these in EXTERNAL as only then
363  * does Byte vs Int matter.
364  *
365  * Neither this C code nor Java reference implementations did this,
366  * so we gloss over it and treat them as int.
367  */
368 
369  if (key[0] == 'B' && key[1] == 'F') {
370  if (!(hdr->BF_codec = cram_decoder_init(encoding, cp, size, E_INT,
371  fd->version))) {
373  return NULL;
374  }
375  } else if (key[0] == 'C' && key[1] == 'F') {
376  if (!(hdr->CF_codec = cram_decoder_init(encoding, cp, size, E_INT,
377  fd->version))) {
379  return NULL;
380  }
381  } else if (key[0] == 'R' && key[1] == 'I') {
382  if (!(hdr->RI_codec = cram_decoder_init(encoding, cp, size, E_INT,
383  fd->version))) {
385  return NULL;
386  }
387  } else if (key[0] == 'R' && key[1] == 'L') {
388  if (!(hdr->RL_codec = cram_decoder_init(encoding, cp, size, E_INT,
389  fd->version))) {
391  return NULL;
392  }
393  } else if (key[0] == 'A' && key[1] == 'P') {
394  if (!(hdr->AP_codec = cram_decoder_init(encoding, cp, size, E_INT,
395  fd->version))) {
397  return NULL;
398  }
399  } else if (key[0] == 'R' && key[1] == 'G') {
400  if (!(hdr->RG_codec = cram_decoder_init(encoding, cp, size, E_INT,
401  fd->version))) {
403  return NULL;
404  }
405  } else if (key[0] == 'M' && key[1] == 'F') {
406  if (!(hdr->MF_codec = cram_decoder_init(encoding, cp, size, E_INT,
407  fd->version))) {
409  return NULL;
410  }
411  } else if (key[0] == 'N' && key[1] == 'S') {
412  if (!(hdr->NS_codec = cram_decoder_init(encoding, cp, size, E_INT,
413  fd->version))) {
415  return NULL;
416  }
417  } else if (key[0] == 'N' && key[1] == 'P') {
418  if (!(hdr->NP_codec = cram_decoder_init(encoding, cp, size, E_INT,
419  fd->version))) {
421  return NULL;
422  }
423  } else if (key[0] == 'T' && key[1] == 'S') {
424  if (!(hdr->TS_codec = cram_decoder_init(encoding, cp, size, E_INT,
425  fd->version))) {
427  return NULL;
428  }
429  } else if (key[0] == 'N' && key[1] == 'F') {
430  if (!(hdr->NF_codec = cram_decoder_init(encoding, cp, size, E_INT,
431  fd->version))) {
433  return NULL;
434  }
435  } else if (key[0] == 'T' && key[1] == 'C') {
436  if (!(hdr->TC_codec = cram_decoder_init(encoding, cp, size, E_BYTE,
437  fd->version))) {
439  return NULL;
440  }
441  } else if (key[0] == 'T' && key[1] == 'N') {
442  if (!(hdr->TN_codec = cram_decoder_init(encoding, cp, size, E_INT,
443  fd->version))) {
445  return NULL;
446  }
447  } else if (key[0] == 'F' && key[1] == 'N') {
448  if (!(hdr->FN_codec = cram_decoder_init(encoding, cp, size, E_INT,
449  fd->version))) {
451  return NULL;
452  }
453  } else if (key[0] == 'F' && key[1] == 'C') {
454  if (!(hdr->FC_codec = cram_decoder_init(encoding, cp, size, E_BYTE,
455  fd->version))) {
457  return NULL;
458  }
459  } else if (key[0] == 'F' && key[1] == 'P') {
460  if (!(hdr->FP_codec = cram_decoder_init(encoding, cp, size, E_INT,
461  fd->version))) {
463  return NULL;
464  }
465  } else if (key[0] == 'B' && key[1] == 'S') {
466  if (!(hdr->BS_codec = cram_decoder_init(encoding, cp, size, E_BYTE,
467  fd->version))) {
469  return NULL;
470  }
471  } else if (key[0] == 'I' && key[1] == 'N') {
472  if (!(hdr->IN_codec = cram_decoder_init(encoding, cp, size,
473  E_BYTE_ARRAY,
474  fd->version))) {
476  return NULL;
477  }
478  } else if (key[0] == 'S' && key[1] == 'C') {
479  if (!(hdr->SC_codec = cram_decoder_init(encoding, cp, size,
480  E_BYTE_ARRAY,
481  fd->version))) {
483  return NULL;
484  }
485  } else if (key[0] == 'D' && key[1] == 'L') {
486  if (!(hdr->DL_codec = cram_decoder_init(encoding, cp, size, E_INT,
487  fd->version))) {
489  return NULL;
490  }
491  } else if (key[0] == 'B' && key[1] == 'A') {
492  if (!(hdr->BA_codec = cram_decoder_init(encoding, cp, size, E_BYTE,
493  fd->version))) {
495  return NULL;
496  }
497  } else if (key[0] == 'R' && key[1] == 'S') {
498  if (!(hdr->RS_codec = cram_decoder_init(encoding, cp, size, E_INT,
499  fd->version))) {
501  return NULL;
502  }
503  } else if (key[0] == 'P' && key[1] == 'D') {
504  if (!(hdr->PD_codec = cram_decoder_init(encoding, cp, size, E_INT,
505  fd->version))) {
507  return NULL;
508  }
509  } else if (key[0] == 'H' && key[1] == 'C') {
510  if (!(hdr->HC_codec = cram_decoder_init(encoding, cp, size, E_INT,
511  fd->version))) {
513  return NULL;
514  }
515  } else if (key[0] == 'M' && key[1] == 'Q') {
516  if (!(hdr->MQ_codec = cram_decoder_init(encoding, cp, size, E_INT,
517  fd->version))) {
519  return NULL;
520  }
521  } else if (key[0] == 'R' && key[1] == 'N') {
522  if (!(hdr->RN_codec = cram_decoder_init(encoding, cp, size,
524  fd->version))) {
526  return NULL;
527  }
528  } else if (key[0] == 'Q' && key[1] == 'S') {
529  if (!(hdr->QS_codec = cram_decoder_init(encoding, cp, size, E_BYTE,
530  fd->version))) {
532  return NULL;
533  }
534  if (!(hdr->Qs_codec = cram_decoder_init(encoding, cp, size,
535  E_BYTE_ARRAY,
536  fd->version))) {
538  return NULL;
539  }
540  } else if (key[0] == 'T' && key[1] == 'L') {
541  if (!(hdr->TL_codec = cram_decoder_init(encoding, cp, size, E_INT,
542  fd->version))) {
544  return NULL;
545  }
546  } else if (key[0] == 'T' && key[1] == 'M') {
547  } else if (key[0] == 'T' && key[1] == 'V') {
548  } else
549  fprintf(stderr, "Unrecognised key: %.2s\n", key);
550 
551  cp += size;
552 
553  m->next = hdr->rec_encoding_map[CRAM_MAP(key[0], key[1])];
554  hdr->rec_encoding_map[CRAM_MAP(key[0], key[1])] = m;
555  }
556  if (cp - cp_copy != map_size) {
558  return NULL;
559  }
560 
561  /* Tag encoding map */
562  cp += itf8_get(cp, &map_size); cp_copy = cp;
563  cp += itf8_get(cp, &map_count);
564  for (i = 0; i < map_count; i++) {
565  int32_t encoding;
566  int32_t size;
567  cram_map *m = malloc(sizeof(*m)); // FIXME: use pooled_alloc
568  char *key = cp+1;
569 
570  if (!m) {
572  return NULL;
573  }
574 
575  m->key = (key[0]<<16)|(key[1]<<8)|key[2];
576 
577  cp += 4; // Strictly ITF8, but this suffices
578  cp += itf8_get(cp, &encoding);
579  cp += itf8_get(cp, &size);
580 
581  m->encoding = encoding;
582  m->size = size;
583  m->offset = cp - (char *)b->data;
584  if (!(m->codec = cram_decoder_init(encoding, cp, size,
585  E_BYTE_ARRAY_BLOCK, fd->version))) {
587  free(m);
588  return NULL;
589  }
590 
591  cp += size;
592 
593  m->next = hdr->tag_encoding_map[CRAM_MAP(key[0],key[1])];
594  hdr->tag_encoding_map[CRAM_MAP(key[0],key[1])] = m;
595  }
596  if (cp - cp_copy != map_size) {
598  return NULL;
599  }
600 
601  return hdr;
602 }
603 
604 /* ----------------------------------------------------------------------
605  * CRAM slices
606  */
607 
608 /*
609  * Decodes a CRAM (un)mapped slice header block.
610  * Returns slice header ptr on success
611  * NULL on failure
612  */
615  char *cp = (char *)b->data;
616  int i;
617 
618  if (b->content_type != MAPPED_SLICE &&
620  return NULL;
621 
622  if (!(hdr = calloc(1, sizeof(*hdr))))
623  return NULL;
624 
625  hdr->content_type = b->content_type;
626 
627  if (b->content_type == MAPPED_SLICE) {
628  cp += itf8_get(cp, &hdr->ref_seq_id);
629  cp += itf8_get(cp, &hdr->ref_seq_start);
630  cp += itf8_get(cp, &hdr->ref_seq_span);
631  }
632  cp += itf8_get(cp, &hdr->num_records);
633  if (fd->version != CRAM_1_VERS)
634  cp += itf8_get(cp, &hdr->record_counter);
635  cp += itf8_get(cp, &hdr->num_blocks);
636 
637  cp += itf8_get(cp, &hdr->num_content_ids);
638  hdr->block_content_ids = malloc(hdr->num_content_ids * sizeof(int32_t));
639  if (!hdr->block_content_ids) {
640  free(hdr);
641  return NULL;
642  }
643 
644  for (i = 0; i < hdr->num_content_ids; i++) {
645  cp += itf8_get(cp, &hdr->block_content_ids[i]);
646  }
647 
648  if (b->content_type == MAPPED_SLICE) {
649  cp += itf8_get(cp, &hdr->ref_base_id);
650  }
651 
652  if (fd->version != CRAM_1_VERS) {
653  memcpy(hdr->md5, cp, 16);
654  } else {
655  memset(hdr->md5, 0, 16);
656  }
657 
658  return hdr;
659 }
660 
661 
662 #if 0
663 /* Returns the number of bits set in val; it the highest bit used */
664 static int nbits(int v) {
665  static const int MultiplyDeBruijnBitPosition[32] = {
666  1, 10, 2, 11, 14, 22, 3, 30, 12, 15, 17, 19, 23, 26, 4, 31,
667  9, 13, 21, 29, 16, 18, 25, 8, 20, 28, 24, 7, 27, 6, 5, 32
668  };
669 
670  v |= v >> 1; // first up to set all bits 1 after the first 1 */
671  v |= v >> 2;
672  v |= v >> 4;
673  v |= v >> 8;
674  v |= v >> 16;
675 
676  // DeBruijn magic to find top bit
677  return MultiplyDeBruijnBitPosition[(uint32_t)(v * 0x07C4ACDDU) >> 27];
678 }
679 #endif
680 
681 #if 0
682 static int sort_freqs(const void *vp1, const void *vp2) {
683  const int i1 = *(const int *)vp1;
684  const int i2 = *(const int *)vp2;
685  return i1-i2;
686 }
687 #endif
688 
689 /* ----------------------------------------------------------------------
690  * Primary CRAM sequence decoder
691  */
692 
693 /*
694  * Internal part of cram_decode_slice().
695  * Generates the sequence, quality and cigar components.
696  */
697 static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
698  cram_block *blk, cram_record *cr, SAM_hdr *bfd,
699  int cf, char *seq, char *qual) {
700  int prev_pos = 0, f, r = 0, out_sz = 1;
701  int seq_pos = 1;
702  int cig_len = 0, ref_pos = cr->apos;
703  int32_t fn, i32;
704  enum cigar_op cig_op = BAM_CMATCH;
705  uint32_t *cigar = s->cigar;
706  uint32_t ncigar = s->ncigar;
707  uint32_t cigar_alloc = s->cigar_alloc;
708  uint32_t nm = 0, md_dist = 0;
709  int orig_aux = 0;
710  int decode_md = fd->decode_md;
711  char buf[20];
712 
713  if (!(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) {
714  memset(qual, 30, cr->len);
715  }
716 
717  if (decode_md) {
718  orig_aux = BLOCK_SIZE(s->aux_blk);
719  BLOCK_APPEND(s->aux_blk, "MDZ", 3);
720  }
721 
722  if (!c->comp_hdr->FN_codec) return -1;
723  r |= c->comp_hdr->FN_codec->decode(s,c->comp_hdr->FN_codec, blk,
724  (char *)&fn, &out_sz);
725 
726  ref_pos--; // count from 0
727  cr->cigar = ncigar;
728  for (f = 0; f < fn; f++) {
729  int32_t pos;
730  char op;
731 
732  if (ncigar+2 >= cigar_alloc) {
733  cigar_alloc = cigar_alloc ? cigar_alloc*2 : 1024;
734  s->cigar = cigar;
735  if (!(cigar = realloc(cigar, cigar_alloc * sizeof(*cigar))))
736  return -1;
737  }
738 
739  if (!c->comp_hdr->FC_codec) return -1;
740  r |= c->comp_hdr->FC_codec->decode(s, c->comp_hdr->FC_codec, blk,
741  &op, &out_sz);
742  if (!c->comp_hdr->FP_codec) return -1;
743  r |= c->comp_hdr->FP_codec->decode(s, c->comp_hdr->FP_codec, blk,
744  (char *)&pos, &out_sz);
745  pos += prev_pos;
746 
747  if (pos > seq_pos) {
748  if (pos > cr->len+1)
749  return -1;
750 
751  if (s->ref && cr->ref_id >= 0) {
752  if (ref_pos + pos - seq_pos > bfd->ref[cr->ref_id].len) {
753  static int whinged = 0;
754  if (!whinged)
755  fprintf(stderr, "Ref pos outside of ref "
756  "sequence boundary\n");
757  whinged = 1;
758  } else {
759  memcpy(&seq[seq_pos-1], &s->ref[ref_pos - s->ref_start +1],
760  pos - seq_pos);
761  }
762  }
763 #ifdef USE_X
764  if (cig_len && cig_op != BAM_CBASE_MATCH) {
765  cigar[ncigar++] = (cig_len<<4) + cig_op;
766  cig_len = 0;
767  }
768  cig_op = BAM_CBASE_MATCH;
769 #else
770  if (cig_len && cig_op != BAM_CMATCH) {
771  cigar[ncigar++] = (cig_len<<4) + cig_op;
772  cig_len = 0;
773  }
774  cig_op = BAM_CMATCH;
775 #endif
776  cig_len += pos - seq_pos;
777  ref_pos += pos - seq_pos;
778  md_dist += pos - seq_pos;
779  seq_pos = pos;
780  }
781 
782  prev_pos = pos;
783 
784  switch(op) {
785  case 'S': { // soft clip: IN
786  int32_t out_sz2 = 1;
787 
788  if (cig_len) {
789  cigar[ncigar++] = (cig_len<<4) + cig_op;
790  cig_len = 0;
791  }
792  if (fd->version == CRAM_1_VERS) {
793  r |= c->comp_hdr->IN_codec
794  ? c->comp_hdr->IN_codec->decode(s, c->comp_hdr->IN_codec,
795  blk, &seq[pos-1], &out_sz2)
796  : (seq[pos-1] = 'N', out_sz2 = 1, 0);
797  } else {
798  r |= c->comp_hdr->SC_codec
799  ? c->comp_hdr->SC_codec->decode(s, c->comp_hdr->SC_codec,
800  blk, &seq[pos-1], &out_sz2)
801  : (seq[pos-1] = 'N', out_sz2 = 1, 0);
802  }
803  cigar[ncigar++] = (out_sz2<<4) + BAM_CSOFT_CLIP;
804  cig_op = BAM_CSOFT_CLIP;
805  seq_pos += out_sz2;
806  break;
807  }
808 
809  case 'X': { // Substitution; BS
810  unsigned char base;
811 #ifdef USE_X
812  if (cig_len && cig_op != BAM_CBASE_MISMATCH) {
813  cigar[ncigar++] = (cig_len<<4) + cig_op;
814  cig_len = 0;
815  }
816  if (!c->comp_hdr->BS_codec) return -1;
817  r |= c->comp_hdr->BS_codec->decode(s, c->comp_hdr->BS_codec, blk,
818  (char *)&base, &out_sz);
819  seq[pos-1] = 'N'; // FIXME look up BS=base value
820  cig_op = BAM_CBASE_MISMATCH;
821 #else
822  int ref_base;
823  if (cig_len && cig_op != BAM_CMATCH) {
824  cigar[ncigar++] = (cig_len<<4) + cig_op;
825  cig_len = 0;
826  }
827  if (!c->comp_hdr->BS_codec) return -1;
828  r |= c->comp_hdr->BS_codec->decode(s, c->comp_hdr->BS_codec, blk,
829  (char *)&base, &out_sz);
830  if (ref_pos >= bfd->ref[cr->ref_id].len || !s->ref) {
831  seq[pos-1] = 'N';
832  } else {
833  ref_base = fd->L1[(uc)s->ref[ref_pos - s->ref_start +1]];
834  seq[pos-1] = c->comp_hdr->substitution_matrix[ref_base][base];
835  if (decode_md) {
836  BLOCK_APPENDF_2(s->aux_blk, buf, "%d%c",
837  md_dist, s->ref[ref_pos-s->ref_start +1]);
838  md_dist = 0;
839  }
840  }
841  cig_op = BAM_CMATCH;
842 #endif
843  nm++;
844  cig_len++;
845  seq_pos++;
846  ref_pos++;
847  break;
848  }
849 
850  case 'D': { // Deletion; DL
851  if (cig_len && cig_op != BAM_CDEL) {
852  cigar[ncigar++] = (cig_len<<4) + cig_op;
853  cig_len = 0;
854  }
855  if (!c->comp_hdr->DL_codec) return -1;
856  r |= c->comp_hdr->DL_codec->decode(s, c->comp_hdr->DL_codec, blk,
857  (char *)&i32, &out_sz);
858  if (decode_md) {
859  BLOCK_APPENDF_1(s->aux_blk, buf, "%d^", md_dist);
860  BLOCK_APPEND(s->aux_blk, &s->ref[ref_pos - s->ref_start +1],
861  i32);
862  md_dist = 0;
863  }
864  cig_op = BAM_CDEL;
865  cig_len += i32;
866  ref_pos += i32;
867  nm += i32;
868  //printf(" %d: DL = %d (ret %d)\n", f, i32, r);
869  break;
870  }
871 
872  case 'I': { // Insertion (several bases); IN
873  int32_t out_sz2 = 1;
874 
875  if (cig_len && cig_op != BAM_CINS) {
876  cigar[ncigar++] = (cig_len<<4) + cig_op;
877  cig_len = 0;
878  }
879 
880  if (!c->comp_hdr->IN_codec) return -1;
881  r |= c->comp_hdr->IN_codec->decode(s, c->comp_hdr->IN_codec, blk,
882  &seq[pos-1], &out_sz2);
883  cig_op = BAM_CINS;
884  cig_len += out_sz2;
885  seq_pos += out_sz2;
886  nm += out_sz2;
887  //printf(" %d: IN(I) = %.*s (ret %d, out_sz %d)\n", f, out_sz2, dat, r, out_sz2);
888  break;
889  }
890 
891  case 'i': { // Insertion (single base); BA
892  if (cig_len && cig_op != BAM_CINS) {
893  cigar[ncigar++] = (cig_len<<4) + cig_op;
894  cig_len = 0;
895  }
896  if (!c->comp_hdr->BA_codec) return -1;
897  r |= c->comp_hdr->BA_codec->decode(s, c->comp_hdr->BA_codec, blk,
898  (char *)&seq[pos-1], &out_sz);
899  cig_op = BAM_CINS;
900  cig_len++;
901  seq_pos++;
902  nm++;
903  //printf(" %d: BA = %c (ret %d)\n", f, seq[pos-1], r);
904  break;
905  }
906 
907  case 'B': { // Read base; BA, QS
908 #ifdef USE_X
909  if (cig_len && cig_op != BAM_CBASE_MISMATCH) {
910  cigar[ncigar++] = (cig_len<<4) + cig_op;
911  cig_len = 0;
912  }
913 #else
914  if (cig_len && cig_op != BAM_CMATCH) {
915  cigar[ncigar++] = (cig_len<<4) + cig_op;
916  cig_len = 0;
917  }
918 #endif
919  if (!c->comp_hdr->BA_codec) return -1;
920  r |= c->comp_hdr->BA_codec->decode(s, c->comp_hdr->BA_codec, blk,
921  (char *)&seq[pos-1], &out_sz);
922  if (!c->comp_hdr->QS_codec) return -1;
923  r |= c->comp_hdr->QS_codec->decode(s, c->comp_hdr->QS_codec, blk,
924  (char *)&qual[pos-1], &out_sz);
925 #ifdef USE_X
926  cig_op = BAM_CBASE_MISMATCH;
927 #else
928  cig_op = BAM_CMATCH;
929 #endif
930  cig_len++;
931  seq_pos++;
932  ref_pos++;
933  //printf(" %d: BA/QS(B) = %c/%d (ret %d)\n", f, i32, qc, r);
934  break;
935  }
936 
937  case 'Q': { // Quality score; QS
938  if (!c->comp_hdr->QS_codec) return -1;
939  r |= c->comp_hdr->QS_codec->decode(s, c->comp_hdr->QS_codec, blk,
940  (char *)&qual[pos-1], &out_sz);
941  //printf(" %d: QS = %d (ret %d)\n", f, qc, r);
942  break;
943  }
944 
945  case 'H': { // hard clip; HC
946  if (cig_len && cig_op != BAM_CHARD_CLIP) {
947  cigar[ncigar++] = (cig_len<<4) + cig_op;
948  cig_len = 0;
949  }
950  if (!c->comp_hdr->HC_codec) return -1;
951  r |= c->comp_hdr->HC_codec->decode(s, c->comp_hdr->HC_codec, blk,
952  (char *)&i32, &out_sz);
953  cig_op = BAM_CHARD_CLIP;
954  cig_len += i32;
955  nm += i32;
956  break;
957  }
958 
959  case 'P': { // padding; PD
960  if (cig_len && cig_op != BAM_CPAD) {
961  cigar[ncigar++] = (cig_len<<4) + cig_op;
962  cig_len = 0;
963  }
964  if (!c->comp_hdr->PD_codec) return -1;
965  r |= c->comp_hdr->PD_codec->decode(s, c->comp_hdr->PD_codec, blk,
966  (char *)&i32, &out_sz);
967  cig_op = BAM_CPAD;
968  cig_len += i32;
969  nm += i32;
970  break;
971  }
972 
973  case 'N': { // Ref skip; RS
974  if (cig_len && cig_op != BAM_CREF_SKIP) {
975  cigar[ncigar++] = (cig_len<<4) + cig_op;
976  cig_len = 0;
977  }
978  if (!c->comp_hdr->RS_codec) return -1;
979  r |= c->comp_hdr->RS_codec->decode(s, c->comp_hdr->RS_codec, blk,
980  (char *)&i32, &out_sz);
981  cig_op = BAM_CREF_SKIP;
982  cig_len += i32;
983  ref_pos += i32;
984  nm += i32;
985  break;
986  }
987 
988  default:
989  abort();
990  }
991  }
992 
993  /* An implement match op for any unaccounted for bases */
994  if (cr->len >= seq_pos) {
995  if (s->ref) {
996  if (ref_pos + cr->len - seq_pos + 1 > bfd->ref[cr->ref_id].len) {
997  static int whinged = 0;
998  if (!whinged)
999  fprintf(stderr, "Ref pos outside of ref sequence boundary\n");
1000  whinged = 1;
1001  } else {
1002  memcpy(&seq[seq_pos-1], &s->ref[ref_pos - s->ref_start +1],
1003  cr->len - seq_pos + 1);
1004  ref_pos += cr->len - seq_pos + 1;
1005  md_dist += cr->len - seq_pos + 1;
1006  }
1007  }
1008 
1009  if (ncigar+1 >= cigar_alloc) {
1010  cigar_alloc = cigar_alloc ? cigar_alloc*2 : 1024;
1011  s->cigar = cigar;
1012  if (!(cigar = realloc(cigar, cigar_alloc * sizeof(*cigar))))
1013  return -1;
1014  }
1015 #ifdef USE_X
1016  if (cig_len && cig_op != BAM_CBASE_MATCH) {
1017  cigar[ncigar++] = (cig_len<<4) + cig_op;
1018  cig_len = 0;
1019  }
1020  cig_op = BAM_CBASE_MATCH;
1021 #else
1022  if (cig_len && cig_op != BAM_CMATCH) {
1023  cigar[ncigar++] = (cig_len<<4) + cig_op;
1024  cig_len = 0;
1025  }
1026  cig_op = BAM_CMATCH;
1027 #endif
1028  cig_len += cr->len - seq_pos+1;
1029  }
1030  if (decode_md) {
1031  BLOCK_APPENDF_1(s->aux_blk, buf, "%d", md_dist);
1032  }
1033 
1034  if (cig_len) {
1035  if (ncigar >= cigar_alloc) {
1036  cigar_alloc = cigar_alloc ? cigar_alloc*2 : 1024;
1037  s->cigar = cigar;
1038  if (!(cigar = realloc(cigar, cigar_alloc * sizeof(*cigar))))
1039  return -1;
1040  }
1041 
1042  cigar[ncigar++] = (cig_len<<4) + cig_op;
1043  }
1044 
1045  cr->ncigar = ncigar - cr->cigar;
1046  cr->aend = ref_pos;
1047 
1048  //printf("2: %.*s %d .. %d\n", cr->name_len, DSTRING_STR(name_ds) + cr->name, cr->apos, ref_pos);
1049 
1050  if (!c->comp_hdr->MQ_codec) return -1;
1051  r |= c->comp_hdr->MQ_codec->decode(s, c->comp_hdr->MQ_codec, blk,
1052  (char *)&cr->mqual, &out_sz);
1053 
1054  if (cf & CRAM_FLAG_PRESERVE_QUAL_SCORES) {
1055  int32_t out_sz2 = cr->len;
1056 
1057  if (!c->comp_hdr->Qs_codec) return -1;
1058  r |= c->comp_hdr->Qs_codec->decode(s, c->comp_hdr->Qs_codec, blk,
1059  qual, &out_sz2);
1060  }
1061 
1062  s->cigar = cigar;
1063  s->cigar_alloc = cigar_alloc;
1064  s->ncigar = ncigar;
1065 
1066  if (decode_md) {
1067  char buf[7];
1068  BLOCK_APPEND_CHAR(s->aux_blk, '\0'); // null terminate MD:Z:
1069  cr->aux_size += BLOCK_SIZE(s->aux_blk) - orig_aux;
1070  buf[0] = 'N'; buf[1] = 'M'; buf[2] = 'I';
1071  buf[3] = (nm>> 0) & 0xff;
1072  buf[4] = (nm>> 8) & 0xff;
1073  buf[5] = (nm>>16) & 0xff;
1074  buf[6] = (nm>>24) & 0xff;
1075  BLOCK_APPEND(s->aux_blk, buf, 7);
1076  cr->aux_size += 7;
1077  }
1078 
1079  return r;
1080 }
1081 
1082 /*
1083  * Quick and simple hash lookup for cram_map arrays
1084  */
1085 static cram_map *map_find(cram_map **map, unsigned char *key, int id) {
1086  cram_map *m;
1087 
1088  m = map[CRAM_MAP(key[0],key[1])];
1089  while (m && m->key != id)
1090  m= m->next;
1091 
1092  return m;
1093 }
1094 
1095 //#define map_find(M,K,I) M[CRAM_MAP(K[0],K[1])];while (m && m->key != I);m= m->next
1096 
1097 
1098 static int cram_decode_aux_1_0(cram_container *c, cram_slice *s,
1099  cram_block *blk, cram_record *cr) {
1100  int i, r = 0, out_sz = 1;
1101  unsigned char ntags;
1102 
1103  if (!c->comp_hdr->TC_codec) return -1;
1104  r |= c->comp_hdr->TC_codec->decode(s, c->comp_hdr->TC_codec, blk,
1105  (char *)&ntags, &out_sz);
1106  cr->ntags = ntags;
1107 
1108  //printf("TC=%d\n", cr->ntags);
1109  cr->aux_size = 0;
1110  cr->aux = BLOCK_SIZE(s->aux_blk);
1111 
1112  for (i = 0; i < cr->ntags; i++) {
1113  int32_t id, out_sz = 1;
1114  unsigned char tag_data[3];
1115  cram_map *m;
1116 
1117  //printf("Tag %d/%d\n", i+1, cr->ntags);
1118  if (!c->comp_hdr->TN_codec) return -1;
1119  r |= c->comp_hdr->TN_codec->decode(s, c->comp_hdr->TN_codec,
1120  blk, (char *)&id, &out_sz);
1121  if (out_sz == 3) {
1122  tag_data[0] = ((char *)&id)[0];
1123  tag_data[1] = ((char *)&id)[1];
1124  tag_data[2] = ((char *)&id)[2];
1125  } else {
1126  tag_data[0] = (id>>16) & 0xff;
1127  tag_data[1] = (id>>8) & 0xff;
1128  tag_data[2] = id & 0xff;
1129  }
1130 
1131  m = map_find(c->comp_hdr->tag_encoding_map, tag_data, id);
1132  if (!m)
1133  return -1;
1134  BLOCK_APPEND(s->aux_blk, (char *)tag_data, 3);
1135 
1136  if (!m->codec) return -1;
1137  r |= m->codec->decode(s, m->codec, blk, (char *)s->aux_blk, &out_sz);
1138 
1139  cr->aux_size += out_sz + 3;
1140  }
1141 
1142  return r;
1143 }
1144 
1145 static int cram_decode_aux(cram_container *c, cram_slice *s,
1146  cram_block *blk, cram_record *cr) {
1147  int i, r = 0, out_sz = 1;
1148  int32_t TL;
1149  unsigned char *TN;
1150 
1151  if (!c->comp_hdr->TL_codec) return -1;
1152  r |= c->comp_hdr->TL_codec->decode(s, c->comp_hdr->TL_codec, blk,
1153  (char *)&TL, &out_sz);
1154  if (r || TL < 0 || TL >= c->comp_hdr->nTL)
1155  return -1;
1156 
1157  TN = c->comp_hdr->TL[TL];
1158  cr->ntags = strlen((char *)TN)/3; // optimise to remove strlen
1159 
1160  //printf("TC=%d\n", cr->ntags);
1161  cr->aux_size = 0;
1162  cr->aux = BLOCK_SIZE(s->aux_blk);
1163 
1164  for (i = 0; i < cr->ntags; i++) {
1165  int32_t id, out_sz = 1;
1166  unsigned char tag_data[3];
1167  cram_map *m;
1168 
1169  //printf("Tag %d/%d\n", i+1, cr->ntags);
1170  tag_data[0] = *TN++;
1171  tag_data[1] = *TN++;
1172  tag_data[2] = *TN++;
1173  id = (tag_data[0]<<16) | (tag_data[1]<<8) | tag_data[2];
1174 
1175  m = map_find(c->comp_hdr->tag_encoding_map, tag_data, id);
1176  if (!m)
1177  return -1;
1178  BLOCK_APPEND(s->aux_blk, (char *)tag_data, 3);
1179 
1180  if (!m->codec) return -1;
1181  r |= m->codec->decode(s, m->codec, blk, (char *)s->aux_blk, &out_sz);
1182  cr->aux_size += out_sz + 3;
1183  }
1184 
1185  return r;
1186 }
1187 
1188 /* Resolve mate pair cross-references between recs within this slice */
1189 static void cram_decode_slice_xref(cram_slice *s) {
1190  int rec;
1191 
1192  for (rec = 0; rec < s->hdr->num_records; rec++) {
1193  cram_record *cr = &s->crecs[rec];
1194 
1195  if (cr->mate_line >= 0) {
1196  if (cr->mate_line < s->hdr->num_records) {
1197  /*
1198  * On the first read, loop through computing lengths.
1199  * It's not perfect as we have one slice per reference so we
1200  * cannot detect when TLEN should be zero due to seqs that
1201  * map to multiple references.
1202  *
1203  * We also cannot set tlen correct when it spans a slice for
1204  * other reasons. This may make tlen too small. Should we
1205  * fix this by forcing TLEN to be stored verbatim in such cases?
1206  *
1207  * Or do we just admit defeat and output 0 for tlen? It's the
1208  * safe option...
1209  */
1210  if (cr->tlen == INT_MIN) {
1211  int id1 = rec, id2 = rec;
1212  int aleft = cr->apos, aright = cr->aend;
1213  int tlen;
1214  int ref = cr->ref_id;
1215 
1216  do {
1217  if (aleft > s->crecs[id2].apos)
1218  aleft = s->crecs[id2].apos;
1219  if (aright < s->crecs[id2].aend)
1220  aright = s->crecs[id2].aend;
1221  if (s->crecs[id2].mate_line == -1) {
1222  s->crecs[id2].mate_line = rec;
1223  break;
1224  }
1225  assert(s->crecs[id2].mate_line > id2);
1226  id2 = s->crecs[id2].mate_line;
1227 
1228  if (s->crecs[id2].ref_id != ref)
1229  ref = -1;
1230  } while (id2 != id1);
1231 
1232  if (ref != -1) {
1233  tlen = aright - aleft + 1;
1234  id1 = id2 = rec;
1235 
1236  /*
1237  * When we have two seqs with identical start and
1238  * end coordinates, set +/- tlen based on 1st/last
1239  * bit flags instead, as a tie breaker.
1240  */
1241  if (s->crecs[id2].apos == aleft) {
1242  if (s->crecs[id2].aend != aright)
1243  s->crecs[id2].tlen = tlen;
1244  else if (s->crecs[id2].flags & BAM_FREAD1)
1245  s->crecs[id2].tlen = tlen;
1246  else
1247  s->crecs[id2].tlen = -tlen;
1248  } else {
1249  s->crecs[id2].tlen = -tlen;
1250  }
1251 
1252  id2 = s->crecs[id2].mate_line;
1253  while (id2 != id1) {
1254  if (s->crecs[id2].apos == aleft) {
1255  if (s->crecs[id2].aend != aright)
1256  s->crecs[id2].tlen = tlen;
1257  else if (s->crecs[id2].flags & BAM_FREAD1)
1258  s->crecs[id2].tlen = tlen;
1259  else
1260  s->crecs[id2].tlen = -tlen;
1261  } else {
1262  s->crecs[id2].tlen = -tlen;
1263  }
1264  id2 = s->crecs[id2].mate_line;
1265  }
1266  } else {
1267  id1 = id2 = rec;
1268 
1269  s->crecs[id2].tlen = 0;
1270  id2 = s->crecs[id2].mate_line;
1271  while (id2 != id1) {
1272  s->crecs[id2].tlen = 0;
1273  id2 = s->crecs[id2].mate_line;
1274  }
1275  }
1276  }
1277 
1278  cr->mate_pos = s->crecs[cr->mate_line].apos;
1279  cr->mate_ref_id = s->crecs[cr->mate_line].ref_id;
1280 
1281  // paired
1282  cr->flags |= BAM_FPAIRED;
1283 
1284  // set mate unmapped if needed
1285  if (s->crecs[cr->mate_line].flags & BAM_FUNMAP) {
1286  cr->flags |= BAM_FMUNMAP;
1287  cr->tlen = 0;
1288  }
1289  if (cr->flags & BAM_FUNMAP) {
1290  cr->tlen = 0;
1291  }
1292 
1293  // set mate reversed if needed
1294  if (s->crecs[cr->mate_line].flags & BAM_FREVERSE)
1295  cr->flags |= BAM_FMREVERSE;
1296  } else {
1297  fprintf(stderr, "Mate line out of bounds: %d vs [0, %d]\n",
1298  cr->mate_line, s->hdr->num_records-1);
1299  }
1300 
1301  /* FIXME: construct read names here too if needed */
1302  } else {
1303  if (cr->mate_flags & CRAM_M_REVERSE) {
1304  cr->flags |= BAM_FPAIRED | BAM_FMREVERSE;
1305  }
1306  if (cr->mate_flags & CRAM_M_UNMAP) {
1307  cr->flags |= BAM_FMUNMAP;
1308  //cr->mate_ref_id = -1;
1309  }
1310  if (!(cr->flags & BAM_FPAIRED))
1311  cr->mate_ref_id = -1;
1312  }
1313 
1314  if (cr->tlen == INT_MIN)
1315  cr->tlen = 0; // Just incase
1316  }
1317 }
1318 
1319 static char *md5_print(unsigned char *md5, char *out) {
1320  int i;
1321  for (i = 0; i < 16; i++) {
1322  out[i*2+0] = "0123456789abcdef"[md5[i]>>4];
1323  out[i*2+1] = "0123456789abcdef"[md5[i]&15];
1324  }
1325  out[32] = 0;
1326 
1327  return out;
1328 }
1329 
1330 /*
1331  * Decode an entire slice from container blocks. Fills out s->crecs[] array.
1332  * Returns 0 on success
1333  * -1 on failure
1334  */
1336  SAM_hdr *bfd) {
1337  cram_block *blk = s->block[0];
1338  int32_t bf, ref_id;
1339  unsigned char cf;
1340  int out_sz, r = 0;
1341  int rec;
1342  char *seq, *qual;
1343  int unknown_rg = -1;
1344  int id, embed_ref;
1345  char **refs = NULL;
1346 
1347  for (id = 0; id < s->hdr->num_blocks; id++) {
1348  if (cram_uncompress_block(s->block[id]))
1349  return -1;
1350  }
1351 
1352  blk->bit = 7; // MSB first
1353 
1354  /* Look for unknown RG, added as last by Java CRAM? */
1355  if (bfd->nrg > 0 &&
1356  !strcmp(bfd->rg[bfd->nrg-1].name, "UNKNOWN"))
1357  unknown_rg = bfd->nrg-1;
1358 
1359  if (blk->content_type != CORE)
1360  return -1;
1361 
1362  if (s->crecs)
1363  free(s->crecs);
1364  if (!(s->crecs = malloc(s->hdr->num_records * sizeof(*s->crecs))))
1365  return -1;
1366 
1367  ref_id = s->hdr->ref_seq_id;
1368  embed_ref = s->hdr->ref_base_id >= 0 ? 1 : 0;
1369 
1370  if (ref_id >= 0) {
1371  if (embed_ref) {
1372  cram_block *b;
1373  if (s->hdr->ref_base_id < 0) {
1374  fprintf(stderr, "No reference specified and "
1375  "no embedded reference is available.\n");
1376  return -1;
1377  }
1378  if (!s->block_by_id ||
1379  !(b = s->block_by_id[s->hdr->ref_base_id]))
1380  return -1;
1381  s->ref = (char *)BLOCK_DATA(b);
1382  s->ref_start = s->hdr->ref_seq_start;
1383  s->ref_end = s->hdr->ref_seq_start + s->hdr->ref_seq_span-1;
1384  } else if (!fd->no_ref) {
1386  //s->ref = cram_get_ref(fd, s->hdr->ref_seq_id, 1, 0);
1387  //s->ref_start = 1;
1388 
1389  s->ref =
1390  cram_get_ref(fd, s->hdr->ref_seq_id,
1391  s->hdr->ref_seq_start,
1392  s->hdr->ref_seq_start + s->hdr->ref_seq_span -1);
1393  s->ref_start = s->hdr->ref_seq_start;
1394  s->ref_end = s->hdr->ref_seq_start + s->hdr->ref_seq_span-1;
1395 
1396  /* Sanity check */
1397  if (s->ref_start < 0) {
1398  fprintf(stderr, "Slice starts before base 1.\n");
1399  s->ref_start = 0;
1400  }
1401  pthread_mutex_lock(&fd->ref_lock);
1402  pthread_mutex_lock(&fd->refs->lock);
1403  if (s->ref_end > fd->refs->ref_id[ref_id]->length) {
1404  fprintf(stderr, "Slice ends beyond reference end.\n");
1405  s->ref_end = fd->refs->ref_id[ref_id]->length;
1406  }
1407  pthread_mutex_unlock(&fd->refs->lock);
1408  pthread_mutex_unlock(&fd->ref_lock);
1409  }
1410  }
1411 
1412  if (s->ref == NULL && s->hdr->ref_seq_id >= 0 && !fd->no_ref) {
1413  fprintf(stderr, "Unable to fetch reference #%d %d..%d\n",
1414  s->hdr->ref_seq_id, s->hdr->ref_seq_start,
1415  s->hdr->ref_seq_start + s->hdr->ref_seq_span-1);
1416  return -1;
1417  }
1418 
1419  if (fd->version != CRAM_1_VERS && s->hdr->ref_seq_id >= 0
1420  && !fd->ignore_md5
1421  && memcmp(s->hdr->md5, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0", 16)) {
1422  MD5_CTX md5;
1423  unsigned char digest[16];
1424 
1425  if (s->ref && s->hdr->ref_seq_id >= 0) {
1426  int start, len;
1427 
1428  if (s->hdr->ref_seq_start >= s->ref_start) {
1429  start = s->hdr->ref_seq_start - s->ref_start;
1430  } else {
1431  fprintf(stderr, "Slice starts before base 1.\n");
1432  start = 0;
1433  }
1434 
1435  if (s->hdr->ref_seq_span <= s->ref_end - s->ref_start + 1) {
1436  len = s->hdr->ref_seq_span;
1437  } else {
1438  fprintf(stderr, "Slice ends beyond reference end.\n");
1439  len = s->ref_end - s->ref_start + 1;
1440  }
1441 
1442  MD5_Init(&md5);
1443  if (start + len > s->ref_end - s->ref_start + 1)
1444  len = s->ref_end - s->ref_start + 1 - start;
1445  if (len >= 0)
1446  MD5_Update(&md5, s->ref + start, len);
1447  MD5_Final(digest, &md5);
1448  } else if (!s->ref && s->hdr->ref_base_id >= 0) {
1449  cram_block *b;
1450  if (s->block_by_id && (b = s->block_by_id[s->hdr->ref_base_id])) {
1451  MD5_Init(&md5);
1452  MD5_Update(&md5, b->data, b->uncomp_size);
1453  MD5_Final(digest, &md5);
1454  }
1455  }
1456 
1457  if ((!s->ref && s->hdr->ref_base_id < 0)
1458  || memcmp(digest, s->hdr->md5, 16) != 0) {
1459  char M[33];
1460  fprintf(stderr, "ERROR: md5sum reference mismatch for ref "
1461  "%d pos %d..%d\n", ref_id, s->ref_start, s->ref_end);
1462  fprintf(stderr, "CRAM: %s\n", md5_print(s->hdr->md5, M));
1463  fprintf(stderr, "Ref : %s\n", md5_print(digest, M));
1464  return -1;
1465  }
1466  }
1467 
1468  if (ref_id == -2) {
1469  pthread_mutex_lock(&fd->ref_lock);
1470  pthread_mutex_lock(&fd->refs->lock);
1471  refs = calloc(fd->refs->nref, sizeof(char *));
1472  pthread_mutex_unlock(&fd->refs->lock);
1473  pthread_mutex_unlock(&fd->ref_lock);
1474  if (!refs)
1475  return -1;
1476  }
1477 
1478  for (rec = 0; rec < s->hdr->num_records; rec++) {
1479  cram_record *cr = &s->crecs[rec];
1480 
1481  //fprintf(stderr, "Decode seq %d, %d/%d\n", rec, blk->byte, blk->bit);
1482 
1483  cr->s = s;
1484 
1485  out_sz = 1; /* decode 1 item */
1486  if (!c->comp_hdr->BF_codec) return -1;
1487  r |= c->comp_hdr->BF_codec->decode(s, c->comp_hdr->BF_codec, blk,
1488  (char *)&bf, &out_sz);
1489  if (bf < 0 ||
1490  bf >= sizeof(fd->bam_flag_swap)/sizeof(*fd->bam_flag_swap))
1491  return -1;
1492  bf = fd->bam_flag_swap[bf];
1493  cr->flags = bf;
1494 
1495  if (fd->version == CRAM_1_VERS) {
1496  /* CF is byte in 1.0, int32 in 2.0 */
1497  if (!c->comp_hdr->CF_codec) return -1;
1498  r |= c->comp_hdr->CF_codec->decode(s, c->comp_hdr->CF_codec, blk,
1499  (char *)&cf, &out_sz);
1500  cr->cram_flags = cf;
1501  } else {
1502  if (!c->comp_hdr->CF_codec) return -1;
1503  r |= c->comp_hdr->CF_codec->decode(s, c->comp_hdr->CF_codec, blk,
1504  (char *)&cr->cram_flags,
1505  &out_sz);
1506  cf = cr->cram_flags;
1507  }
1508 
1509  if (fd->version != CRAM_1_VERS && ref_id == -2) {
1510  if (!c->comp_hdr->RI_codec) return -1;
1511  r |= c->comp_hdr->RI_codec->decode(s, c->comp_hdr->RI_codec, blk,
1512  (char *)&cr->ref_id, &out_sz);
1513  if (cr->ref_id >= 0) {
1514  if (!fd->no_ref) {
1515  if (!refs[cr->ref_id])
1516  refs[cr->ref_id] = cram_get_ref(fd, cr->ref_id, 1, 0);
1517  s->ref = refs[cr->ref_id];
1518  }
1519  s->ref_start = 1;
1520  pthread_mutex_lock(&fd->ref_lock);
1521  pthread_mutex_lock(&fd->refs->lock);
1522  s->ref_end = fd->refs->ref_id[cr->ref_id]->length;
1523  pthread_mutex_unlock(&fd->refs->lock);
1524  pthread_mutex_unlock(&fd->ref_lock);
1525  }
1526  } else {
1527  cr->ref_id = ref_id; // Forced constant in CRAM 1.0
1528  }
1529 
1530 
1531  if (!c->comp_hdr->RL_codec) return -1;
1532  r |= c->comp_hdr->RL_codec->decode(s, c->comp_hdr->RL_codec, blk,
1533  (char *)&cr->len, &out_sz);
1534 
1535  if (!c->comp_hdr->AP_codec) return -1;
1536  r |= c->comp_hdr->AP_codec->decode(s, c->comp_hdr->AP_codec, blk,
1537  (char *)&cr->apos, &out_sz);
1538  if (c->comp_hdr->AP_delta)
1539  cr->apos += s->last_apos;
1540  s->last_apos= cr->apos;
1541 
1542  if (!c->comp_hdr->RG_codec) return -1;
1543  r |= c->comp_hdr->RG_codec->decode(s, c->comp_hdr->RG_codec, blk,
1544  (char *)&cr->rg, &out_sz);
1545  if (cr->rg == unknown_rg)
1546  cr->rg = -1;
1547 
1548  cr->name_len = 0;
1549 
1550  if (c->comp_hdr->read_names_included) {
1551  int32_t out_sz2 = 1;
1552 
1553  // Read directly into name cram_block
1554  cr->name = BLOCK_SIZE(s->name_blk);
1555  if (!c->comp_hdr->RN_codec) return -1;
1556  r |= c->comp_hdr->RN_codec->decode(s, c->comp_hdr->RN_codec, blk,
1557  (char *)s->name_blk, &out_sz2);
1558  cr->name_len = out_sz2;
1559  }
1560 
1561  cr->mate_line = -1;
1562  cr->mate_ref_id = -1;
1563  if (cf & CRAM_FLAG_DETACHED) {
1564  if (fd->version == CRAM_1_VERS) {
1565  /* MF is byte in 1.0, int32 in 2.0 */
1566  unsigned char mf;
1567  if (!c->comp_hdr->MF_codec) return -1;
1568  r |= c->comp_hdr->MF_codec->decode(s, c->comp_hdr->MF_codec,
1569  blk, (char *)&mf, &out_sz);
1570  cr->mate_flags = mf;
1571  } else {
1572  if (!c->comp_hdr->MF_codec) return -1;
1573  r |= c->comp_hdr->MF_codec->decode(s, c->comp_hdr->MF_codec,
1574  blk,
1575  (char *)&cr->mate_flags,
1576  &out_sz);
1577  }
1578 
1579  if (!c->comp_hdr->read_names_included) {
1580  int32_t out_sz2 = 1;
1581 
1582  // Read directly into name cram_block
1583  cr->name = BLOCK_SIZE(s->name_blk);
1584  if (!c->comp_hdr->RN_codec) return -1;
1585  r |= c->comp_hdr->RN_codec->decode(s, c->comp_hdr->RN_codec,
1586  blk, (char *)s->name_blk,
1587  &out_sz2);
1588  cr->name_len = out_sz2;
1589  }
1590 
1591  if (!c->comp_hdr->NS_codec) return -1;
1592  r |= c->comp_hdr->NS_codec->decode(s, c->comp_hdr->NS_codec, blk,
1593  (char *)&cr->mate_ref_id, &out_sz);
1594 
1595 // Skip as mate_ref of "*" is legit. It doesn't mean unmapped, just unknown.
1596 // if (cr->mate_ref_id == -1 && cr->flags & 0x01) {
1597 // /* Paired, but unmapped */
1598 // cr->flags |= BAM_FMUNMAP;
1599 // }
1600 
1601  if (!c->comp_hdr->NP_codec) return -1;
1602  r |= c->comp_hdr->NP_codec->decode(s, c->comp_hdr->NP_codec, blk,
1603  (char *)&cr->mate_pos, &out_sz);
1604  if (!c->comp_hdr->TS_codec) return -1;
1605  r |= c->comp_hdr->TS_codec->decode(s, c->comp_hdr->TS_codec, blk,
1606  (char *)&cr->tlen, &out_sz);
1607  } else if (cf & CRAM_FLAG_MATE_DOWNSTREAM) {
1608  if (!c->comp_hdr->NF_codec) return -1;
1609  r |= c->comp_hdr->NF_codec->decode(s, c->comp_hdr->NF_codec, blk,
1610  (char *)&cr->mate_line, &out_sz);
1611  cr->mate_line += rec + 1;
1612 
1613  //cr->name_len = sprintf(name, "%d", name_id++);
1614  //cr->name = DSTRING_LEN(name_ds);
1615  //dstring_nappend(name_ds, name, cr->name_len);
1616 
1617  cr->mate_ref_id = -1;
1618  cr->tlen = INT_MIN;
1619  cr->mate_pos = 0;
1620  } else {
1621  cr->mate_flags = 0;
1622  cr->tlen = INT_MIN;
1623  }
1624  /*
1625  else if (!name[0]) {
1626  //name[0] = '?'; name[1] = 0;
1627  //cr->name_len = 1;
1628  //cr->name= DSTRING_LEN(s->name_ds);
1629  //dstring_nappend(s->name_ds, "?", 1);
1630 
1631  cr->mate_ref_id = -1;
1632  cr->tlen = 0;
1633  cr->mate_pos = 0;
1634  }
1635  */
1636 
1637  /* Auxiliary tags */
1638  if (fd->version == CRAM_1_VERS)
1639  r |= cram_decode_aux_1_0(c, s, blk, cr);
1640  else
1641  r |= cram_decode_aux(c, s, blk, cr);
1642 
1643  /* Fake up dynamic string growth and appending */
1644  cr->seq = BLOCK_SIZE(s->seqs_blk);
1645  BLOCK_GROW(s->seqs_blk, cr->len);
1646  seq = (char *)BLOCK_END(s->seqs_blk);
1647  BLOCK_SIZE(s->seqs_blk) += cr->len;
1648 
1649  if (!seq)
1650  return -1;
1651 
1652  cr->qual = BLOCK_SIZE(s->qual_blk);
1653  BLOCK_GROW(s->qual_blk, cr->len);
1654  qual = (char *)BLOCK_END(s->qual_blk);
1655  BLOCK_SIZE(s->qual_blk) += cr->len;
1656 
1657  if (!s->ref)
1658  memset(seq, '=', cr->len);
1659 
1660  if (!(bf & BAM_FUNMAP)) {
1661  /* Decode sequence and generate CIGAR */
1662  r |= cram_decode_seq(fd, c, s, blk, cr, bfd, cf, seq, qual);
1663  } else {
1664  int out_sz2 = cr->len;
1665 
1666  //puts("Unmapped");
1667  cr->cigar = 0;
1668  cr->ncigar = 0;
1669  cr->aend = cr->apos;
1670  cr->mqual = 0;
1671 
1672  if (!c->comp_hdr->BA_codec) return -1;
1673  r |= c->comp_hdr->BA_codec->decode(s, c->comp_hdr->BA_codec, blk,
1674  (char *)seq, &out_sz2);
1675 
1676  if (cf & CRAM_FLAG_PRESERVE_QUAL_SCORES) {
1677  out_sz2 = cr->len;
1678  if (!c->comp_hdr->Qs_codec) return -1;
1679  r |= c->comp_hdr->Qs_codec->decode(s, c->comp_hdr->Qs_codec,
1680  blk, qual, &out_sz2);
1681  } else {
1682  memset(qual, 30, cr->len);
1683  }
1684  }
1685  }
1686 
1687  pthread_mutex_lock(&fd->ref_lock);
1688  if (refs) {
1689  int i;
1690  for (i = 0; i < fd->refs->nref; i++) {
1691  if (refs[i])
1692  cram_ref_decr(fd->refs, i);
1693  }
1694  free(refs);
1695  } else if (ref_id >= 0 && s->ref != fd->ref_free) {
1696  cram_ref_decr(fd->refs, ref_id);
1697  }
1698  pthread_mutex_unlock(&fd->ref_lock);
1699 
1700  /* Resolve mate pair cross-references between recs within this slice */
1701  cram_decode_slice_xref(s);
1702 
1703  return r;
1704 }
1705 
1706 typedef struct {
1712 } cram_decode_job;
1713 
1714 void *cram_decode_slice_thread(void *arg) {
1715  cram_decode_job *j = (cram_decode_job *)arg;
1716 
1717  j->exit_code = cram_decode_slice(j->fd, j->c, j->s, j->h);
1718 
1719  return j;
1720 }
1721 
1722 /*
1723  * Spawn a multi-threaded version of cram_decode_slice().
1724  */
1726  SAM_hdr *bfd) {
1727  cram_decode_job *j;
1728  int nonblock;
1729 
1730  if (!fd->pool)
1731  return cram_decode_slice(fd, c, s, bfd);
1732 
1733  if (!(j = malloc(sizeof(*j))))
1734  return -1;
1735 
1736  j->fd = fd;
1737  j->c = c;
1738  j->s = s;
1739  j->h = bfd;
1740 
1741  nonblock = t_pool_results_queue_len(fd->rqueue) ? 0 : 1;
1742 
1744  j, nonblock)) {
1745  /* Would block */
1746  fd->job_pending = j;
1747  } else {
1748  fd->job_pending = NULL;
1749  }
1750 
1751  // flush too
1752  return 0;
1753 }
1754 
1755 
1756 /* ----------------------------------------------------------------------
1757  * CRAM sequence iterators.
1758  */
1759 
1760 /*
1761  * Converts a cram in-memory record into a bam in-memory record. We
1762  * pass a pointer to a bam_seq_t pointer along with the a pointer to
1763  * the allocated size. These can initially be pointers to NULL and zero.
1764  *
1765  * This function will reallocate the bam buffer as required and update
1766  * (*bam)->alloc accordingly, allowing it to be used within a loop
1767  * efficiently without needing to allocate new bam objects over and
1768  * over again.
1769  *
1770  * Returns the used size of the bam record on success
1771  * -1 on failure.
1772  */
1773 static int cram_to_bam(SAM_hdr *bfd, cram_fd *fd, cram_slice *s,
1774  cram_record *cr, int rec, bam_seq_t **bam) {
1775  int bam_idx, rg_len;
1776  char name_a[1024], *name;
1777  int name_len;
1778  char *aux, *aux_orig;
1779 
1780  /* Assign names if not explicitly set */
1781  if (cr->name_len) {
1782  name = (char *)BLOCK_DATA(s->name_blk) + cr->name;
1783  name_len = cr->name_len;
1784  } else {
1785  // FIXME: add prefix, container number, slice number, etc
1786  name = name_a;
1787 
1788  if (cr->mate_line >= 0 && cr->mate_line < rec)
1789  name_len = sprintf(name_a, "%s:%"PRId64":%d",
1790  fd->prefix, s->id, cr->mate_line);
1791  else
1792  name_len = sprintf(name_a, "%s:%"PRId64":%d",
1793  fd->prefix, s->id, rec);
1794  }
1795 
1796  /* Generate BAM record */
1797  if (cr->rg < -1 || cr->rg >= bfd->nrg)
1798  return -1;
1799  rg_len = (cr->rg != -1) ? bfd->rg[cr->rg].name_len + 4 : 0;
1800 
1801  if (!BLOCK_DATA(s->seqs_blk))
1802  return -1;
1803  if (!BLOCK_DATA(s->qual_blk))
1804  return -1;
1805 
1806  bam_idx = bam_construct_seq(bam, cr->aux_size + rg_len,
1807  name, name_len,
1808  cr->flags,
1809  cr->ref_id,
1810  cr->apos,
1811  cr->aend,
1812  cr->mqual,
1813  cr->ncigar, &s->cigar[cr->cigar],
1814  cr->mate_ref_id,
1815  cr->mate_pos,
1816  cr->tlen,
1817  cr->len,
1818  (char *)BLOCK_DATA(s->seqs_blk) + cr->seq,
1819  (char *)BLOCK_DATA(s->qual_blk) + cr->qual);
1820  if (bam_idx == -1)
1821  return -1;
1822 
1823  aux = aux_orig = (char *)bam_aux(*bam);
1824 
1825  /* Auxiliary strings */
1826  if (cr->aux_size != 0) {
1827  memcpy(aux, BLOCK_DATA(s->aux_blk) + cr->aux, cr->aux_size);
1828  aux += cr->aux_size;
1829  }
1830 
1831  /* RG:Z: */
1832  if (cr->rg != -1) {
1833  int len = bfd->rg[cr->rg].name_len;
1834  *aux++ = 'R'; *aux++ = 'G'; *aux++ = 'Z';
1835  memcpy(aux, bfd->rg[cr->rg].name, len);
1836  aux += len;
1837  *aux++ = 0;
1838  }
1839 
1840 #ifndef SAMTOOLS
1841  bam_set_blk_size(*bam, bam_blk_size(*bam) + (aux - aux_orig));
1842 #endif
1843 
1844  *aux++ = 0;
1845 
1846  return bam_idx + (aux - aux_orig);
1847 }
1848 
1849 /*
1850  * Here be dragons! The multi-threading code in this is crufty beyond belief.
1851  */
1852 static cram_slice *cram_next_slice(cram_fd *fd, cram_container **cp) {
1853  cram_container *c;
1854  cram_slice *s = NULL;
1855 
1856  fd->eof = 0;
1857 
1858  if (!(c = fd->ctr)) {
1859  // Load first container.
1860  do {
1861  if (!(c = fd->ctr = cram_read_container(fd)))
1862  return NULL;
1863  } while (c->length == 0);
1864 
1865  /*
1866  * The first container may be a result of a sub-range query.
1867  * In which case it may still not be the optimal starting point
1868  * due to skipped containers/slices in the index.
1869  */
1870  if (fd->range.refid != -2) {
1871  while (c->ref_seq_id != -2 &&
1872  (c->ref_seq_id < fd->range.refid ||
1873  c->ref_seq_start + c->ref_seq_span-1 < fd->range.start)) {
1874  if (0 != cram_seek(fd, c->length, SEEK_CUR))
1875  return NULL;
1876  cram_free_container(fd->ctr);
1877  do {
1878  if (!(c = fd->ctr = cram_read_container(fd)))
1879  return NULL;
1880  } while (c->length == 0);
1881  }
1882 
1883  if (c->ref_seq_id != -2 && c->ref_seq_id != fd->range.refid)
1884  return NULL;
1885  }
1886 
1887  if (!(c->comp_hdr_block = cram_read_block(fd)))
1888  return NULL;
1890  return NULL;
1891 
1893  if (!c->comp_hdr)
1894  return NULL;
1895  if (!c->comp_hdr->AP_delta) {
1896  pthread_mutex_lock(&fd->ref_lock);
1897  fd->unsorted = 1;
1898  pthread_mutex_unlock(&fd->ref_lock);
1899  }
1900  }
1901 
1902  if ((s = c->slice))
1903  cram_free_slice(s);
1904 
1905  if (c->curr_slice == c->max_slice) {
1907  c = NULL;
1908  }
1909 
1910  /* Sorry this is so contorted! */
1911  for (;;) {
1912  if (fd->job_pending) {
1914  c = j->c;
1915  s = j->s;
1916  free(fd->job_pending);
1917  fd->job_pending = NULL;
1918  } else if (!fd->ooc) {
1919  empty_container:
1920  if (!c || c->curr_slice == c->max_slice) {
1921  // new container
1922  do {
1923  if (!(c = fd->ctr = cram_read_container(fd))) {
1924  if (fd->pool) {
1925  fd->ooc = 1;
1926  break;
1927  }
1928 
1929  return NULL;
1930  }
1931  } while (c->length == 0);
1932  if (fd->ooc)
1933  break;
1934 
1935  /* Skip containers not yet spanning our range */
1936  if (fd->range.refid != -2 && c->ref_seq_id != -2) {
1937  if (c->ref_seq_id != fd->range.refid) {
1938  fd->eof = 1;
1939  return NULL;
1940  }
1941 
1942  if (c->ref_seq_start > fd->range.end) {
1943  fd->eof = 1;
1944  return NULL;
1945  }
1946 
1947  if (c->ref_seq_start + c->ref_seq_span-1 <
1948  fd->range.start) {
1949  c->curr_rec = c->max_rec;
1950  c->curr_slice = c->max_slice;
1951  cram_seek(fd, c->length, SEEK_CUR);
1953  c = NULL;
1954  continue;
1955  }
1956  }
1957 
1958  if (!(c->comp_hdr_block = cram_read_block(fd)))
1959  return NULL;
1961  return NULL;
1962 
1963  c->comp_hdr =
1965  if (!c->comp_hdr)
1966  return NULL;
1967 
1968  if (!c->comp_hdr->AP_delta) {
1969  pthread_mutex_lock(&fd->ref_lock);
1970  fd->unsorted = 1;
1971  pthread_mutex_unlock(&fd->ref_lock);
1972  }
1973  }
1974 
1975  if (c->num_records == 0) {
1976  cram_free_container(c); c = NULL;
1977  goto empty_container;
1978  }
1979 
1980 
1981  if (!(s = c->slice = cram_read_slice(fd)))
1982  return NULL;
1983  c->curr_slice++;
1984  c->curr_rec = 0;
1985  c->max_rec = s->hdr->num_records;
1986 
1987  s->last_apos = s->hdr->ref_seq_start;
1988 
1989  /* Skip slices not yet spanning our range */
1990  if (fd->range.refid != -2 && s->hdr->ref_seq_id != -2) {
1991  if (s->hdr->ref_seq_id != fd->range.refid) {
1992  fd->eof = 1;
1993  cram_free_slice(s);
1994  c->slice = NULL;
1995  return NULL;
1996  }
1997 
1998  if (s->hdr->ref_seq_start > fd->range.end) {
1999  fd->eof = 1;
2000  cram_free_slice(s);
2001  c->slice = NULL;
2002  return NULL;
2003  }
2004 
2005  if (s->hdr->ref_seq_start + s->hdr->ref_seq_span-1 <
2006  fd->range.start) {
2007  cram_free_slice(s);
2008  c->slice = NULL;
2010  c = NULL;
2011  continue;
2012  }
2013  }
2014  }
2015 
2016  /* Test decoding of 1st seq */
2017  if (!c || !s)
2018  break;
2019 
2020  if (cram_decode_slice_mt(fd, c, s, fd->header) != 0) {
2021  // if (cram_decode_slice(fd, c, s, fd->header) != 0) {
2022  fprintf(stderr, "Failure to decode slice\n");
2023  cram_free_slice(s);
2024  c->slice = NULL;
2025  return NULL;
2026  }
2027 
2028  if (!fd->pool || fd->job_pending)
2029  break;
2030 
2031  if (t_pool_results_queue_sz(fd->rqueue) > fd->pool->qsize)
2032  break;
2033  }
2034 
2035  if (fd->pool) {
2036  t_pool_result *res;
2037  cram_decode_job *j;
2038 
2039 // fprintf(stderr, "Thread pool len = %d, %d\n",
2040 // t_pool_results_queue_len(fd->rqueue),
2041 // t_pool_results_queue_sz(fd->rqueue));
2042 
2043  if (fd->ooc && t_pool_results_queue_empty(fd->rqueue))
2044  return NULL;
2045 
2046  res = t_pool_next_result_wait(fd->rqueue);
2047 
2048  if (!res || !res->data) {
2049  fprintf(stderr, "t_pool_next_result failure\n");
2050  return NULL;
2051  }
2052 
2053  j = (cram_decode_job *)res->data;
2054  c = j->c;
2055  s = j->s;
2056 
2057  t_pool_delete_result(res, 1);
2058  }
2059 
2060  *cp = c;
2061  return s;
2062 }
2063 
2064 /*
2065  * Read the next cram record and return it.
2066  * Note that to decode cram_record the caller will need to look up some data
2067  * in the current slice, pointed to by fd->ctr->slice. This is valid until
2068  * the next call to cram_get_seq (which may invalidate it).
2069  *
2070  * Returns record pointer on success (do not free)
2071  * NULL on failure
2072  */
2074  cram_container *c;
2075  cram_slice *s;
2076 
2077  for (;;) {
2078  c = fd->ctr;
2079  if (c && c->slice && c->curr_rec < c->max_rec) {
2080  s = c->slice;
2081  } else {
2082  if (!(s = cram_next_slice(fd, &c)))
2083  return NULL;
2084  }
2085 
2086  if (fd->range.refid != -2) {
2087  if (s->crecs[c->curr_rec].ref_id < fd->range.refid) {
2088  c->curr_rec++;
2089  continue;
2090  }
2091 
2092  if (s->crecs[c->curr_rec].ref_id != fd->range.refid) {
2093  fd->eof = 1;
2094  cram_free_slice(s);
2095  c->slice = NULL;
2096  return NULL;
2097  }
2098 
2099  if (s->crecs[c->curr_rec].apos > fd->range.end) {
2100  fd->eof = 1;
2101  cram_free_slice(s);
2102  c->slice = NULL;
2103  return NULL;
2104  }
2105 
2106  if (s->crecs[c->curr_rec].aend < fd->range.start) {
2107  c->curr_rec++;
2108  continue;
2109  }
2110  }
2111 
2112  break;
2113  }
2114 
2115  fd->ctr = c;
2116  c->slice = s;
2117  return &s->crecs[c->curr_rec++];
2118 }
2119 
2120 /*
2121  * Read the next cram record and convert it to a bam_seq_t struct.
2122  *
2123  * Returns 0 on success
2124  * -1 on EOF or failure (check fd->err)
2125  */
2127  cram_record *cr;
2128  cram_container *c;
2129  cram_slice *s;
2130 
2131  if (!(cr = cram_get_seq(fd)))
2132  return -1;
2133 
2134  c = fd->ctr;
2135  s = c->slice;
2136 
2137  return cram_to_bam(fd->header, fd, s, cr, c->curr_rec-1, bam);
2138 }