NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
cram_encode.c
Go to the documentation of this file.
1 /*
2 Copyright (c) 2012-2013 Genome Research Ltd.
3 Author: James Bonfield <jkb@sanger.ac.uk>
4 
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
7 
8  1. Redistributions of source code must retain the above copyright notice,
9 this list of conditions and the following disclaimer.
10 
11  2. Redistributions in binary form must reproduce the above copyright notice,
12 this list of conditions and the following disclaimer in the documentation
13 and/or other materials provided with the distribution.
14 
15  3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16 Institute nor the names of its contributors may be used to endorse or promote
17 products derived from this software without specific prior written permission.
18 
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30 
31 #ifdef HAVE_CONFIG_H
32 #include "io_lib_config.h"
33 #endif
34 
35 #include <stdio.h>
36 #include <errno.h>
37 #include <assert.h>
38 #include <stdlib.h>
39 #include <string.h>
40 #include <zlib.h>
41 #include <sys/types.h>
42 #include <sys/stat.h>
43 #include <math.h>
44 #include <ctype.h>
45 
46 #include "cram/cram.h"
47 #include "cram/os.h"
48 #include "cram/md5.h"
49 
50 #ifdef SAMTOOLS
51 # define bam_copy(dst, src) bam_copy1(*(dst), (src))
52 #else
53 void bam_copy(bam_seq_t **bt, bam_seq_t *bf) {
54  size_t a;
55 
56  if (bf->alloc > (*bt)->alloc) {
57  a = ((int)((bf->alloc+15)/16))*16;
58  *bt = realloc(*bt, a);
59  memcpy(*bt, bf, bf->alloc);
60  } else {
61  a = (*bt)->alloc;
62  memcpy(*bt, bf, bf->alloc);
63  }
64 
65  (*bt)->alloc = a;
66 }
67 #endif
68 
69 #define Z_CRAM_STRAT Z_FILTERED
70 //#define Z_CRAM_STRAT Z_RLE
71 //#define Z_CRAM_STRAT Z_HUFFMAN_ONLY
72 //#define Z_CRAM_STRAT Z_DEFAULT_STRATEGY
73 
74 static int process_one_read(cram_fd *fd, cram_container *c,
75  cram_slice *s, cram_record *cr,
76  bam_seq_t *b, int rnum);
77 
78 /*
79  * Returns index of val into key.
80  * Basically strchr(key, val)-key;
81  */
82 static int sub_idx(char *key, char val) {
83  int i;
84 
85  for (i = 0; *key && *key++ != val; i++);
86  return i;
87 }
88 
89 /*
90  * Encodes a compression header block into a generic cram_block structure.
91  *
92  * Returns cram_block ptr on success
93  * NULL on failure
94  */
99  int i, mc;
100 
101  if (!cb || !map)
102  return NULL;
103 
104  /*
105  * This is a concatenation of several blocks of data:
106  * header + landmarks, preservation map, read encoding map, and the tag
107  * encoding map.
108  * All 4 are variable sized and we need to know how large these are
109  * before creating the compression header itself as this starts with
110  * the total size (stored as a variable length string).
111  */
112 
113  // Duplicated from container itself, and removed in 1.1
114  if (fd->version == CRAM_1_VERS) {
115  itf8_put_blk(cb, h->ref_seq_id);
116  itf8_put_blk(cb, h->ref_seq_start);
117  itf8_put_blk(cb, h->ref_seq_span);
118  itf8_put_blk(cb, h->num_records);
119  itf8_put_blk(cb, h->num_landmarks);
120  for (i = 0; i < h->num_landmarks; i++) {
121  itf8_put_blk(cb, h->landmark[i]);
122  }
123  }
124 
125  /* Create in-memory preservation map */
126  /* FIXME: should create this when we create the container */
127  {
128  khint_t k;
129  int r;
130 
131  if (!(h->preservation_map = kh_init(map)))
132  return NULL;
133 
134  k = kh_put(map, h->preservation_map, "RN", &r);
135  if (-1 == r) return NULL;
136  kh_val(h->preservation_map, k).i = 1;
137 
138  if (fd->version == CRAM_1_VERS) {
139  k = kh_put(map, h->preservation_map, "PI", &r);
140  if (-1 == r) return NULL;
141  kh_val(h->preservation_map, k).i = 0;
142 
143  k = kh_put(map, h->preservation_map, "UI", &r);
144  if (-1 == r) return NULL;
145  kh_val(h->preservation_map, k).i = 1;
146 
147  k = kh_put(map, h->preservation_map, "MI", &r);
148  if (-1 == r) return NULL;
149  kh_val(h->preservation_map, k).i = 1;
150 
151  } else {
152  // Technically SM was in 1.0, but wasn't in Java impl.
153  k = kh_put(map, h->preservation_map, "SM", &r);
154  if (-1 == r) return NULL;
155  kh_val(h->preservation_map, k).i = 0;
156 
157  k = kh_put(map, h->preservation_map, "TD", &r);
158  if (-1 == r) return NULL;
159  kh_val(h->preservation_map, k).i = 0;
160 
161  k = kh_put(map, h->preservation_map, "AP", &r);
162  if (-1 == r) return NULL;
163  kh_val(h->preservation_map, k).i = c->pos_sorted;
164 
165  if (fd->no_ref || fd->embed_ref) {
166  // Reference Required == No
167  k = kh_put(map, h->preservation_map, "RR", &r);
168  if (-1 == r) return NULL;
169  kh_val(h->preservation_map, k).i = 0;
170  }
171  }
172  }
173 
174  /* Encode preservation map; could collapse this and above into one */
175  mc = 0;
176  BLOCK_SIZE(map) = 0;
177  if (h->preservation_map) {
178  khint_t k;
179 
180  for (k = kh_begin(h->preservation_map);
181  k != kh_end(h->preservation_map);
182  k++) {
183  const char *key;
184  khash_t(map) *pmap = h->preservation_map;
185 
186 
187  if (!kh_exist(pmap, k))
188  continue;
189 
190  key = kh_key(pmap, k);
191  BLOCK_APPEND(map, key, 2);
192 
193  switch(CRAM_KEY(key[0], key[1])) {
194  case CRAM_KEY('M','I'):
195  BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i);
196  break;
197 
198  case CRAM_KEY('U','I'):
199  BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i);
200  break;
201 
202  case CRAM_KEY('P','I'):
203  BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i);
204  break;
205 
206  case CRAM_KEY('A','P'):
207  BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i);
208  break;
209 
210  case CRAM_KEY('R','N'):
211  BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i);
212  break;
213 
214  case CRAM_KEY('R','R'):
215  BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i);
216  break;
217 
218  case CRAM_KEY('S','M'): {
219  char smat[5], *mp = smat;
220  *mp++ =
221  (sub_idx("CGTN", h->substitution_matrix[0][0]) << 6) |
222  (sub_idx("CGTN", h->substitution_matrix[0][1]) << 4) |
223  (sub_idx("CGTN", h->substitution_matrix[0][2]) << 2) |
224  (sub_idx("CGTN", h->substitution_matrix[0][3]) << 0);
225  *mp++ =
226  (sub_idx("AGTN", h->substitution_matrix[1][0]) << 6) |
227  (sub_idx("AGTN", h->substitution_matrix[1][1]) << 4) |
228  (sub_idx("AGTN", h->substitution_matrix[1][2]) << 2) |
229  (sub_idx("AGTN", h->substitution_matrix[1][3]) << 0);
230  *mp++ =
231  (sub_idx("ACTN", h->substitution_matrix[2][0]) << 6) |
232  (sub_idx("ACTN", h->substitution_matrix[2][1]) << 4) |
233  (sub_idx("ACTN", h->substitution_matrix[2][2]) << 2) |
234  (sub_idx("ACTN", h->substitution_matrix[2][3]) << 0);
235  *mp++ =
236  (sub_idx("ACGN", h->substitution_matrix[3][0]) << 6) |
237  (sub_idx("ACGN", h->substitution_matrix[3][1]) << 4) |
238  (sub_idx("ACGN", h->substitution_matrix[3][2]) << 2) |
239  (sub_idx("ACGN", h->substitution_matrix[3][3]) << 0);
240  *mp++ =
241  (sub_idx("ACGT", h->substitution_matrix[4][0]) << 6) |
242  (sub_idx("ACGT", h->substitution_matrix[4][1]) << 4) |
243  (sub_idx("ACGT", h->substitution_matrix[4][2]) << 2) |
244  (sub_idx("ACGT", h->substitution_matrix[4][3]) << 0);
245  BLOCK_APPEND(map, smat, 5);
246  break;
247  }
248 
249  case CRAM_KEY('T','D'): {
250  itf8_put_blk(map, BLOCK_SIZE(h->TD_blk));
251  BLOCK_APPEND(map,
252  BLOCK_DATA(h->TD_blk),
253  BLOCK_SIZE(h->TD_blk));
254  break;
255  }
256 
257  default:
258  fprintf(stderr, "Unknown preservation key '%.2s'\n", key);
259  break;
260  }
261 
262  mc++;
263  }
264  }
265  itf8_put_blk(cb, BLOCK_SIZE(map) + itf8_size(mc));
266  itf8_put_blk(cb, mc);
267  BLOCK_APPEND(cb, BLOCK_DATA(map), BLOCK_SIZE(map));
268 
269  /* rec encoding map */
270  mc = 0;
271  BLOCK_SIZE(map) = 0;
272  if (h->BF_codec) {
273  if (-1 == h->BF_codec->store(h->BF_codec, map, "BF", fd->version))
274  return NULL;
275  mc++;
276  }
277  if (h->CF_codec) {
278  if (-1 == h->CF_codec->store(h->CF_codec, map, "CF", fd->version))
279  return NULL;
280  mc++;
281  }
282  if (h->RL_codec) {
283  if (-1 == h->RL_codec->store(h->RL_codec, map, "RL", fd->version))
284  return NULL;
285  mc++;
286  }
287  if (h->AP_codec) {
288  if (-1 == h->AP_codec->store(h->AP_codec, map, "AP", fd->version))
289  return NULL;
290  mc++;
291  }
292  if (h->RG_codec) {
293  if (-1 == h->RG_codec->store(h->RG_codec, map, "RG", fd->version))
294  return NULL;
295  mc++;
296  }
297  if (h->MF_codec) {
298  if (-1 == h->MF_codec->store(h->MF_codec, map, "MF", fd->version))
299  return NULL;
300  mc++;
301  }
302  if (h->NS_codec) {
303  if (-1 == h->NS_codec->store(h->NS_codec, map, "NS", fd->version))
304  return NULL;
305  mc++;
306  }
307  if (h->NP_codec) {
308  if (-1 == h->NP_codec->store(h->NP_codec, map, "NP", fd->version))
309  return NULL;
310  mc++;
311  }
312  if (h->TS_codec) {
313  if (-1 == h->TS_codec->store(h->TS_codec, map, "TS", fd->version))
314  return NULL;
315  mc++;
316  }
317  if (h->NF_codec) {
318  if (-1 == h->NF_codec->store(h->NF_codec, map, "NF", fd->version))
319  return NULL;
320  mc++;
321  }
322  if (h->TC_codec) {
323  if (-1 == h->TC_codec->store(h->TC_codec, map, "TC", fd->version))
324  return NULL;
325  mc++;
326  }
327  if (h->TN_codec) {
328  if (-1 == h->TN_codec->store(h->TN_codec, map, "TN", fd->version))
329  return NULL;
330  mc++;
331  }
332  if (h->TL_codec) {
333  if (-1 == h->TL_codec->store(h->TL_codec, map, "TL", fd->version))
334  return NULL;
335  mc++;
336  }
337  if (h->FN_codec) {
338  if (-1 == h->FN_codec->store(h->FN_codec, map, "FN", fd->version))
339  return NULL;
340  mc++;
341  }
342  if (h->FC_codec) {
343  if (-1 == h->FC_codec->store(h->FC_codec, map, "FC", fd->version))
344  return NULL;
345  mc++;
346  }
347  if (h->FP_codec) {
348  if (-1 == h->FP_codec->store(h->FP_codec, map, "FP", fd->version))
349  return NULL;
350  mc++;
351  }
352  if (h->BS_codec) {
353  if (-1 == h->BS_codec->store(h->BS_codec, map, "BS", fd->version))
354  return NULL;
355  mc++;
356  }
357  if (h->IN_codec) {
358  if (-1 == h->IN_codec->store(h->IN_codec, map, "IN", fd->version))
359  return NULL;
360  mc++;
361  }
362  if (h->DL_codec) {
363  if (-1 == h->DL_codec->store(h->DL_codec, map, "DL", fd->version))
364  return NULL;
365  mc++;
366  }
367  if (h->BA_codec) {
368  if (-1 == h->BA_codec->store(h->BA_codec, map, "BA", fd->version))
369  return NULL;
370  mc++;
371  }
372  if (h->MQ_codec) {
373  if (-1 == h->MQ_codec->store(h->MQ_codec, map, "MQ", fd->version))
374  return NULL;
375  mc++;
376  }
377  if (h->RN_codec) {
378  if (-1 == h->RN_codec->store(h->RN_codec, map, "RN", fd->version))
379  return NULL;
380  mc++;
381  }
382  if (h->QS_codec) {
383  if (-1 == h->QS_codec->store(h->QS_codec, map, "QS", fd->version))
384  return NULL;
385  mc++;
386  }
387  if (h->Qs_codec) {
388  if (-1 == h->Qs_codec->store(h->Qs_codec, map, "Qs", fd->version))
389  return NULL;
390  mc++;
391  }
392  if (h->RI_codec) {
393  if (-1 == h->RI_codec->store(h->RI_codec, map, "RI", fd->version))
394  return NULL;
395  mc++;
396  }
397  if (fd->version != CRAM_1_VERS) {
398  if (h->SC_codec) {
399  if (-1 == h->SC_codec->store(h->SC_codec, map, "SC", fd->version))
400  return NULL;
401  mc++;
402  }
403  if (h->RS_codec) {
404  if (-1 == h->RS_codec->store(h->RS_codec, map, "RS", fd->version))
405  return NULL;
406  mc++;
407  }
408  if (h->PD_codec) {
409  if (-1 == h->PD_codec->store(h->PD_codec, map, "PD", fd->version))
410  return NULL;
411  mc++;
412  }
413  if (h->HC_codec) {
414  if (-1 == h->HC_codec->store(h->HC_codec, map, "HC", fd->version))
415  return NULL;
416  mc++;
417  }
418  }
419  if (h->TM_codec) {
420  if (-1 == h->TM_codec->store(h->TM_codec, map, "TM", fd->version))
421  return NULL;
422  mc++;
423  }
424  if (h->TV_codec) {
425  if (-1 == h->TV_codec->store(h->TV_codec, map, "TV", fd->version))
426  return NULL;
427  mc++;
428  }
429  itf8_put_blk(cb, BLOCK_SIZE(map) + itf8_size(mc));
430  itf8_put_blk(cb, mc);
431  BLOCK_APPEND(cb, BLOCK_DATA(map), BLOCK_SIZE(map));
432 
433  /* tag encoding map */
434 #if 0
435  mp = map; mc = 0;
436  if (h->tag_encoding_map) {
437  HashItem *hi;
438  HashIter *iter = HashTableIterCreate();
439  if (!iter)
440  return NULL;
441 
442  while ((hi = HashTableIterNext(h->tag_encoding_map, iter))) {
443  cram_map *m = hi->data.p;
444  int sz;
445 
446  mp += itf8_put(mp, (hi->key[0]<<16)|(hi->key[1]<<8)|hi->key[2]);
447  if (-1 == (sz = m->codec->store(m->codec, mp, NULL, fd->version)))
448  return NULL;
449  mp += sz;
450  mc++;
451  }
452 
453  HashTableIterDestroy(iter);
454  }
455 #else
456  mc = 0;
457  BLOCK_SIZE(map) = 0;
458  if (c->tags_used) {
459  khint_t k;
460 
461  for (k = kh_begin(c->tags_used); k != kh_end(c->tags_used); k++) {
462  if (!kh_exist(c->tags_used, k))
463  continue;
464 
465  mc++;
466  itf8_put_blk(map, kh_key(c->tags_used, k));
467 
468  // use block content id 4
469  switch(kh_key(c->tags_used, k) & 0xff) {
470  case 'Z': case 'H':
471  // string as byte_array_stop
472  if (fd->version == CRAM_1_VERS) {
473  BLOCK_APPEND(map,
474  "\005" // BYTE_ARRAY_STOP
475  "\005" // len
476  "\t" // stop-byte is also SAM separator
477  CRAM_EXT_TAG_S "\000\000\000",
478  7);
479  } else {
480  BLOCK_APPEND(map,
481  "\005" // BYTE_ARRAY_STOP
482  "\002" // len
483  "\t" // stop-byte is also SAM separator
485  4);
486  }
487  break;
488 
489  case 'A': case 'c': case 'C':
490  // byte array len, 1 byte
491  BLOCK_APPEND(map,
492  "\004" // BYTE_ARRAY_LEN
493  "\011" // length
494  "\003" // HUFFMAN (len)
495  "\004" // huffman-len
496  "\001" // 1 symbol
497  "\001" // symbol=1 byte value
498  "\001" // 1 length
499  "\000" // length=0
500  "\001" // EXTERNAL (val)
501  "\001" // external-len
502  CRAM_EXT_TAG_S,// content-id
503  11);
504  break;
505 
506  case 's': case 'S':
507  // byte array len, 2 byte
508  BLOCK_APPEND(map,
509  "\004" // BYTE_ARRAY_LEN
510  "\011" // length
511  "\003" // HUFFMAN (len)
512  "\004" // huffman-len
513  "\001" // 1 symbol
514  "\002" // symbol=2 byte value
515  "\001" // 1 length
516  "\000" // length=0
517  "\001" // EXTERNAL (val)
518  "\001" // external-len
519  CRAM_EXT_TAG_S,// content-id
520  11);
521  break;
522 
523  case 'i': case 'I': case 'f':
524  // byte array len, 4 byte
525  BLOCK_APPEND(map,
526  "\004" // BYTE_ARRAY_LEN
527  "\011" // length
528  "\003" // HUFFMAN (len)
529  "\004" // huffman-len
530  "\001" // 1 symbol
531  "\004" // symbol=4 byte value
532  "\001" // 1 length
533  "\000" // length=0
534  "\001" // EXTERNAL (val)
535  "\001" // external-len
536  CRAM_EXT_TAG_S,// content-id
537  11);
538  break;
539 
540  case 'B':
541  // Byte array of variable size, but we generate our tag
542  // byte stream at the wrong stage (during reading and not
543  // after slice header construction). So we use
544  // BYTE_ARRAY_LEN with the length codec being external
545  // too.
546  BLOCK_APPEND(map,
547  "\004" // BYTE_ARRAY_LEN
548  "\006" // length
549  "\001" // EXTERNAL (len)
550  "\001" // external-len
551  "\004" // content-id
552  "\001" // EXTERNAL (val)
553  "\001" // external-len
554  CRAM_EXT_TAG_S,// content-id
555  8);
556  break;
557 
558  default:
559  fprintf(stderr, "Unsupported SAM aux type '%c'\n",
560  kh_key(c->tags_used, k) & 0xff);
561  }
562  //mp += m->codec->store(m->codec, mp, NULL, fd->version);
563  }
564  }
565 #endif
566  itf8_put_blk(cb, BLOCK_SIZE(map) + itf8_size(mc));
567  itf8_put_blk(cb, mc);
568  BLOCK_APPEND(cb, BLOCK_DATA(map), BLOCK_SIZE(map));
569 
570  if (fd->verbose)
571  fprintf(stderr, "Wrote compression block header in %d bytes\n",
572  (int)BLOCK_SIZE(cb));
573 
574  BLOCK_UPLEN(cb);
575 
576  cram_free_block(map);
577 
578  return cb;
579 }
580 
581 
582 /*
583  * Encodes a slice compression header.
584  *
585  * Returns cram_block on success
586  * NULL on failure
587  */
589  char *buf;
590  char *cp;
592  int j;
593 
594  if (!b)
595  return NULL;
596 
597  if (NULL == (cp = buf = malloc(16+5*(8+s->hdr->num_blocks)))) {
598  cram_free_block(b);
599  return NULL;
600  }
601 
602  cp += itf8_put(cp, s->hdr->ref_seq_id);
603  cp += itf8_put(cp, s->hdr->ref_seq_start);
604  cp += itf8_put(cp, s->hdr->ref_seq_span);
605  cp += itf8_put(cp, s->hdr->num_records);
606  if (fd->version != CRAM_1_VERS)
607  cp += itf8_put(cp, s->hdr->record_counter);
608  cp += itf8_put(cp, s->hdr->num_blocks);
609  cp += itf8_put(cp, s->hdr->num_content_ids);
610  for (j = 0; j < s->hdr->num_content_ids; j++) {
611  cp += itf8_put(cp, s->hdr->block_content_ids[j]);
612  }
613  if (s->hdr->content_type == MAPPED_SLICE)
614  cp += itf8_put(cp, s->hdr->ref_base_id);
615 
616  if (fd->version != CRAM_1_VERS) {
617  memcpy(cp, s->hdr->md5, 16); cp += 16;
618  }
619 
620  assert(cp-buf <= 16+5*(8+s->hdr->num_blocks));
621 
622  b->data = (unsigned char *)buf;
623  b->comp_size = b->uncomp_size = cp-buf;
624 
625  return b;
626 }
627 
628 
629 /*
630  * Encodes a single slice from a container
631  * FIXME: break into smaller components.
632  *
633  * Returns 0 on success
634  * -1 on failure
635  */
636 static int cram_encode_slice(cram_fd *fd, cram_container *c,
638  int rec, r = 0, last_pos;
639  cram_block *core;
640  int nblk, embed_ref;
641 
642  embed_ref = fd->embed_ref && s->hdr->ref_seq_id != -1 ? 1 : 0;
643 
644  /*
645  * Slice external blocks:
646  * ID 0 => base calls (insertions, soft-clip)
647  * ID 1 => qualities
648  * ID 2 => names
649  * ID 3 => TS (insert size), NP (next frag)
650  * ID 4 => tag values
651  * ID 5 => BA, ifdef BA_external
652  * ID 6 => tag IDs (TN), ifdef TN_external and CRAM_1_VERS
653  * ID 7 => TD tag dictionary, if !CRAM_1_VERS
654  */
655 
656  /* Create cram slice header, num_blocks etc */
657  s->hdr->ref_base_id = embed_ref ? CRAM_EXT_REF : -1;
659  c->num_records += s->hdr->num_records;
660  nblk = (fd->version == CRAM_1_VERS) ? 5 : 6;
661 #ifdef BA_external
662  nblk++;
663 #endif
664 #ifdef TN_external
665  if (fd->version == CRAM_1_VERS) {
666  nblk++;
667  }
668 #endif
669  if (embed_ref)
670  nblk++;
671 
672  s->hdr->num_content_ids = nblk;
673  s->hdr->num_blocks = s->hdr->num_content_ids+1;
674  s->block = calloc(s->hdr->num_blocks, sizeof(s->block[0]));
675  s->hdr->block_content_ids = malloc(s->hdr->num_content_ids *
676  sizeof(int32_t));
677  if (!s->block || !s->hdr->block_content_ids)
678  return -1;
679  s->hdr->block_content_ids[0] = 0; // core
685  nblk = (fd->version == CRAM_1_VERS) ? 5 : 6;
686 #ifdef BA_external
687  s->hdr->block_content_ids[(s->ba_id = ++nblk)-1] = CRAM_EXT_BA;
688 #endif
689 #ifdef TN_external
690  if (fd->version == CRAM_1_VERS) {
691  s->hdr->block_content_ids[(s->tn_id = ++nblk)-1] = CRAM_EXT_TN;
692  }
693 #endif
694  if (embed_ref)
695  s->hdr->block_content_ids[(s->ref_id = ++nblk)-1] = CRAM_EXT_REF;
696 
697  if (!(s->block[0] = cram_new_block(CORE, 0))) return -1;
698  if (!(s->block[1] = cram_new_block(EXTERNAL, CRAM_EXT_IN))) return -1;
699  if (!(s->block[2] = cram_new_block(EXTERNAL, CRAM_EXT_QUAL))) return -1;
700  if (!(s->block[3] = cram_new_block(EXTERNAL, CRAM_EXT_NAME))) return -1;
701  if (!(s->block[4] = cram_new_block(EXTERNAL, CRAM_EXT_TS_NP))) return -1;
702  if (!(s->block[5] = cram_new_block(EXTERNAL, CRAM_EXT_TAG))) return -1;
703  if (fd->version != CRAM_1_VERS) {
704  if (!(s->block[6] = cram_new_block(EXTERNAL, CRAM_EXT_SC)))
705  return -1;
706  }
707 #ifdef BA_external
708  if (!(s->block[s->ba_id] = cram_new_block(EXTERNAL, CRAM_EXT_BA)))
709  return -1;
710 #endif
711 #ifdef TN_external
712  if (fd->version == CRAM_1_VERS) {
713  if (!(s->block[s->tn_id] = cram_new_block(EXTERNAL, CRAM_EXT_TN)))
714  return -1;
715  }
716 #endif
717  if (embed_ref) {
718  if (!(s->block[s->ref_id] = cram_new_block(EXTERNAL, CRAM_EXT_REF)))
719  return -1;
720  BLOCK_APPEND(s->block[s->ref_id],
721  c->ref + c->first_base - c->ref_start,
722  c->last_base - c->first_base + 1);
723  }
724 
725  core = s->block[0];
726 
727  /* Create a formal method for stealing from dstrings! */
728  s->block[4]->data = calloc(10, s->hdr->num_records); // NP TS
729  if (!s->block[4]->data)
730  return -1;
731  s->block[4]->comp_size = s->block[4]->uncomp_size = 0;
732 
733 #ifdef BA_external
734  s->block[s->ba_id]->data = calloc(1, s->BA_len);
735  if (!s->block[s->ba_id]->data)
736  return -1;
737  s->block[s->ba_id]->comp_size = s->block[s->ba_id]->uncomp_size = 0;
738 #endif
739 
740  /* Generate core block */
741  if (!(s->hdr_block = cram_encode_slice_header(fd, s)))
742  return -1;
743 
744  last_pos = s->hdr->ref_seq_start;
745  for (rec = 0; rec < s->hdr->num_records; rec++) {
746  cram_record *cr = &s->crecs[rec];
747  int32_t i32;
748  unsigned char uc;
749 
750  //fprintf(stderr, "Encode seq %d, %d/%d FN=%d, %s\n", rec, core->byte, core->bit, cr->nfeature, s->name_ds->str + cr->name);
751 
752  //printf("BF=0x%x\n", cr->flags);
753  // bf = cram_flag_swap[cr->flags];
754  i32 = fd->cram_flag_swap[cr->flags & 0xfff];
755  r |= h->BF_codec->encode(s, h->BF_codec, core, (char *)&i32, 1);
756 
757  i32 = cr->cram_flags;
758  r |= h->CF_codec->encode(s, h->CF_codec, core,
759  (char *)&i32, 1);
760 
761  if (fd->version != CRAM_1_VERS)
762  r |= h->RI_codec->encode(s, h->RI_codec, core,
763  (char *)&cr->ref_id, 1);
764 
765  r |= h->RL_codec->encode(s, h->RL_codec, core,
766  (char *)&cr->len, 1);
767 
768  if (c->pos_sorted) {
769  i32 = cr->apos - last_pos;
770  r |= h->AP_codec->encode(s, h->AP_codec, core, (char *)&i32, 1);
771  last_pos = cr->apos;
772  } else {
773  i32 = cr->apos;
774  r |= h->AP_codec->encode(s, h->AP_codec, core, (char *)&i32, 1);
775  }
776 
777  r |= h->RG_codec->encode(s, h->RG_codec, core,
778  (char *)&cr->rg, 1);
779 
780  if (c->comp_hdr->read_names_included) {
781  // RN codec: Already stored in block[3].
782  }
783 
784  if (cr->cram_flags & CRAM_FLAG_DETACHED) {
785  i32 = cr->mate_flags;
786  r |= h->MF_codec->encode(s, h->MF_codec, core, (char *)&i32, 1);
787 
788  if (!c->comp_hdr->read_names_included) {
789  // RN codec: Already stored in block[3].
790  }
791 
792 #ifndef NS_external
793  r |= h->NS_codec->encode(s, h->NS_codec, core,
794  (char *)&cr->mate_ref_id, 1);
795 #else
796  s->block[4]->uncomp_size +=
797  itf8_put(&s->block[4]->data[s->block[4]->uncomp_size],
798  cr->mate_ref_id);
799 #endif
800 
801 #ifndef TS_external
802  r |= h->NP_codec->encode(s, h->NP_codec, core,
803  (char *)&cr->mate_pos, 1);
804 
805  r |= h->TS_codec->encode(s, h->TS_codec, core,
806  (char *)&cr->tlen, 1);
807 #else
808  s->block[4]->uncomp_size +=
809  itf8_put((char *)&s->block[4]->data[s->block[4]->uncomp_size],
810  cr->mate_pos);
811  s->block[4]->uncomp_size +=
812  itf8_put((char *)&s->block[4]->data[s->block[4]->uncomp_size],
813  cr->tlen);
814 #endif
815  } else if (cr->cram_flags & CRAM_FLAG_MATE_DOWNSTREAM) {
816  r |= h->NF_codec->encode(s, h->NF_codec, core,
817  (char *)&cr->mate_line, 1);
818  }
819 
820  /* Aux tags */
821  if (fd->version == CRAM_1_VERS) {
822  uc = cr->ntags;
823  r |= h->TC_codec->encode(s, h->TC_codec, core, (char *)&uc, 1);
824 #ifndef TN_external
825  {
826  int j;
827  for (j = 0; j < cr->ntags; j++) {
828  uint32_t i32 = s->TN[cr->TN_idx + j]; // id
829  r |= h->TN_codec->encode(s, h->TN_codec, core,
830  (char *)&i32, 1);
831  }
832  }
833 #endif
834  } else {
835  r |= h->TL_codec->encode(s, h->TL_codec, core, (char *)&cr->TL, 1);
836  }
837 
838  // qual
839  // QS codec : Already stored in block[2].
840 
841  // features (diffs)
842  if (!(cr->flags & BAM_FUNMAP)) {
843  int prev_pos = 0, j;
844 
845  r |= h->FN_codec->encode(s, h->FN_codec, core,
846  (char *)&cr->nfeature, 1);
847  for (j = 0; j < cr->nfeature; j++) {
848  cram_feature *f = &s->features[cr->feature + j];
849 
850  uc = f->X.code;
851  r |= h->FC_codec->encode(s, h->FC_codec, core,
852  (char *)&uc, 1);
853  i32 = f->X.pos - prev_pos;
854  r |= h->FP_codec->encode(s, h->FP_codec, core,
855  (char *)&i32, 1);
856  prev_pos = f->X.pos;
857 
858  switch(f->X.code) {
859  //char *seq;
860 
861  case 'X':
862  //fprintf(stderr, " FC=%c FP=%d base=%d\n", f->X.code, i32, f->X.base);
863 
864  uc = f->X.base;
865  r |= h->BS_codec->encode(s, h->BS_codec, core,
866  (char *)&uc, 1);
867  break;
868  case 'S':
869  //seq = DSTRING_STR(s->seqs_ds) + f->S.seq_idx;
870  //r |= h->SC_codec->encode(s, h->SC_codec, core,
871  // seq, f->S.len);
872  break;
873  case 'I':
874  //seq = DSTRING_STR(s->seqs_ds) + f->S.seq_idx;
875  //r |= h->IN_codec->encode(s, h->IN_codec, core,
876  // seq, f->S.len);
877  break;
878  case 'i':
879  uc = f->i.base;
880 #ifdef BA_external
881  s->block[s->ba_id]->data[s->block[s->ba_id]->uncomp_size++] = uc;
882 #else
883  r |= h->BA_codec->encode(s, h->BA_codec, core,
884  (char *)&uc, 1);
885 #endif
886  //seq = DSTRING_STR(s->seqs_ds) + f->S.seq_idx;
887  //r |= h->IN_codec->encode(s, h->IN_codec, core,
888  // seq, 1);
889  break;
890  case 'D':
891  i32 = f->D.len;
892  r |= h->DL_codec->encode(s, h->DL_codec, core,
893  (char *)&i32, 1);
894  break;
895 
896  case 'B':
897 // // Used when we try to store a non ACGTN base or an N
898 // // that aligns against a non ACGTN reference
899 
900  uc = f->B.base;
901 #ifdef BA_external
902  s->block[s->ba_id]->data[s->block[s->ba_id]->uncomp_size++] = uc;
903 #else
904  r |= h->BA_codec->encode(s, h->BA_codec, core,
905  (char *)&uc, 1);
906 #endif
907 
908 // Already added
909 // uc = f->B.qual;
910 // r |= h->QS_codec->encode(s, h->QS_codec, core,
911 // (char *)&uc, 1);
912  break;
913 
914  case 'Q':
915 // Already added
916 // uc = f->B.qual;
917 // r |= h->QS_codec->encode(s, h->QS_codec, core,
918 // (char *)&uc, 1);
919  break;
920 
921  case 'N':
922  i32 = f->N.len;
923  r |= h->RS_codec->encode(s, h->RS_codec, core,
924  (char *)&i32, 1);
925  break;
926 
927  case 'P':
928  i32 = f->P.len;
929  r |= h->PD_codec->encode(s, h->PD_codec, core,
930  (char *)&i32, 1);
931  break;
932 
933  case 'H':
934  i32 = f->H.len;
935  r |= h->HC_codec->encode(s, h->HC_codec, core,
936  (char *)&i32, 1);
937  break;
938 
939 
940  default:
941  fprintf(stderr, "unhandled feature code %c\n",
942  f->X.code);
943  return -1;
944  }
945  }
946 
947  r |= h->MQ_codec->encode(s, h->MQ_codec, core,
948  (char *)&cr->mqual, 1);
949  } else {
950  char *seq = (char *)BLOCK_DATA(s->seqs_blk) + cr->seq;
951 #ifdef BA_external
952  memcpy(&s->block[s->ba_id]->data[s->block[s->ba_id]->uncomp_size],
953  seq, cr->len);
954  s->block[s->ba_id]->uncomp_size += cr->len;
955 #else
956  r |= h->BA_codec->encode(s, h->BA_codec, core, seq, cr->len);
957 #endif
958  }
959 
960  if (r)
961  return -1;
962  }
963  s->block[0]->uncomp_size = s->block[0]->byte + (s->block[0]->bit < 7);
964  s->block[0]->comp_size = s->block[0]->uncomp_size;
965 
966  // FIXME: we should avoid creating these in the first place and just
967  // point them to s->base_blk et al.
968  cram_free_block(s->block[1]);
969  cram_free_block(s->block[2]);
970  cram_free_block(s->block[3]);
971  cram_free_block(s->block[5]);
972  if (fd->version != CRAM_1_VERS) {
973  cram_free_block(s->block[6]);
974  BLOCK_UPLEN(s->soft_blk);
975  s->block[6] = s->soft_blk;
976  s->soft_blk = NULL;
977  }
978  BLOCK_UPLEN(s->base_blk); s->block[1] = s->base_blk; s->base_blk = NULL;
979  BLOCK_UPLEN(s->qual_blk); s->block[2] = s->qual_blk; s->qual_blk = NULL;
980  BLOCK_UPLEN(s->name_blk); s->block[3] = s->name_blk; s->name_blk = NULL;
981  BLOCK_UPLEN(s->aux_blk); s->block[5] = s->aux_blk; s->aux_blk = NULL;
982 
983 #ifdef TN_external
984  if (fd->version == CRAM_1_VERS) {
985  cram_free_block(s->block[s->tn_id]);
986  BLOCK_UPLEN(s->tn_blk); s->block[s->tn_id] = s->tn_blk;
987  s->tn_blk = NULL;
988  }
989 #endif
990 
991  s->block[4]->comp_size = s->block[4]->uncomp_size;
992 
993 #ifdef BA_external
994  s->block[s->ba_id]->comp_size = s->block[s->ba_id]->uncomp_size;
995 #endif
996 
997  /* Compress the CORE Block too, with minimal zlib level */
998  if (fd->level > 5)
999  cram_compress_block(fd, s->block[0], NULL, 1, Z_CRAM_STRAT, -1, -1);
1000 
1001 #define USE_METRICS
1002 
1003 #ifdef USE_METRICS
1004 # define LEVEL2 1
1005 # define STRAT2 Z_RLE
1006 #else
1007 # define LEVEL2 -1
1008 # define STRAT2 -1
1009 #endif
1010 
1011  /* Compress the other blocks */
1012  if (cram_compress_block(fd, s->block[1], NULL, //IN (seq)
1013  fd->level, Z_CRAM_STRAT,
1014  -1, -1))
1015  return -1;
1016 
1017  if (fd->level == 0) {
1018  /* Do nothing */
1019  } else if (fd->level == 1) {
1020  if (cram_compress_block(fd, s->block[2], fd->m[1], //qual
1021  1, Z_RLE, -1, -1))
1022  return -1;
1023  if (cram_compress_block(fd, s->block[5], fd->m[4], //Tags
1024  1, Z_RLE, -1, -1))
1025  return -1;
1026  } else if (fd->level < 3) {
1027  if (cram_compress_block(fd, s->block[2], fd->m[1], //qual
1028  1, Z_RLE,
1029  1, Z_HUFFMAN_ONLY))
1030  return -1;
1031  if (cram_compress_block(fd, s->block[5], fd->m[4], //Tags
1032  1, Z_RLE,
1033  1, Z_HUFFMAN_ONLY))
1034  return -1;
1035  } else {
1036  if (cram_compress_block(fd, s->block[2], fd->m[1], //qual
1037  fd->level, Z_CRAM_STRAT,
1038  LEVEL2, STRAT2))
1039  return -1;
1040  if (cram_compress_block(fd, s->block[5], fd->m[4], //Tags
1041  fd->level, Z_CRAM_STRAT,
1042  LEVEL2, STRAT2))
1043  return -1;
1044  }
1045  if (cram_compress_block(fd, s->block[3], NULL, //Name
1046  fd->level, Z_CRAM_STRAT,
1047  -1, -1))
1048  return -1;
1049  if (cram_compress_block(fd, s->block[4], NULL, //TS, NP
1050  fd->level, Z_CRAM_STRAT,
1051  -1, -1))
1052  return -1;
1053  if (fd->version != CRAM_1_VERS) {
1054  if (cram_compress_block(fd, s->block[6], NULL, //SC (seq)
1055  fd->level, Z_CRAM_STRAT,
1056  -1, -1))
1057  return -1;
1058  }
1059 #ifdef BA_external
1060  if (cram_compress_block(fd, s->block[s->ba_id], NULL,
1061  fd->level, Z_CRAM_STRAT, -1, -1))
1062  return -1;
1063 #endif
1064 #ifdef TN_external
1065  if (fd->version == CRAM_1_VERS) {
1066  if (cram_compress_block(fd, s->block[s->tn_id], NULL,
1067  fd->level, Z_DEFAULT_STRATEGY, -1, -1))
1068  return -1;
1069  }
1070 #endif
1071  if (embed_ref) {
1072  BLOCK_UPLEN(s->block[s->ref_id]);
1073  if (cram_compress_block(fd, s->block[s->ref_id], NULL,
1074  fd->level, Z_DEFAULT_STRATEGY, -1, -1))
1075  return -1;
1076  }
1077 
1078  return r ? -1 : 0;
1079 }
1080 
1081 /*
1082  * Encodes all slices in a container into blocks.
1083  * Returns 0 on success
1084  * -1 on failure
1085  */
1087  int i, j, slice_offset;
1089  cram_block *c_hdr;
1090  int multi_ref = 0;
1091  int r1, r2, sn, nref;
1092  spare_bams *spares;
1093 
1094  /* Cache references up-front if we have unsorted access patterns */
1095  pthread_mutex_lock(&fd->ref_lock);
1096  nref = fd->refs->nref;
1097  pthread_mutex_unlock(&fd->ref_lock);
1098 
1099  if (c->refs_used) {
1100  for (i = 0; i < nref; i++) {
1101  if (c->refs_used[i]) {
1102  cram_get_ref(fd, i, 1, 0);
1103  }
1104  }
1105  }
1106 
1107  /* Fetch reference sequence */
1108  if (!fd->no_ref) {
1109  bam_seq_t *b = c->bams[0];
1110  char *ref;
1111 
1112  ref = cram_get_ref(fd, bam_ref(b), 1, 0);
1113  if (!ref && bam_ref(b) >= 0) {
1114  fprintf(stderr, "Failed to load reference #%d\n", bam_ref(b));
1115  return -1;
1116  }
1117  if ((c->ref_id = bam_ref(b)) >= 0) {
1118  c->ref_seq_id = c->ref_id;
1119  c->ref = fd->refs->ref_id[c->ref_seq_id]->seq;
1120  c->ref_start = 1;
1121  c->ref_end = fd->refs->ref_id[c->ref_seq_id]->length;
1122  } else {
1123  c->ref_seq_id = c->ref_id; // FIXME remove one var!
1124  }
1125  } else {
1126  c->ref_seq_id = c->ref_id; // FIXME remove one var!
1127  }
1128 
1129  /* Turn bams into cram_records and gather basic stats */
1130  for (r1 = sn = 0; r1 < c->curr_c_rec; sn++) {
1131  cram_slice *s = c->slices[sn];
1132  int first_base = INT_MAX, last_base = INT_MIN;
1133 
1134  assert(sn < c->curr_slice);
1135 
1136  /* FIXME: we could create our slice objects here too instead of
1137  * in cram_put_bam_seq. It's more natural here and also this is
1138  * bit is threaded so it's less work in the main thread.
1139  */
1140 
1141  for (r2 = 0; r1 < c->curr_c_rec && r2 < c->max_rec; r1++, r2++) {
1142  cram_record *cr = &s->crecs[r2];
1143  bam_seq_t *b = c->bams[r1];
1144 
1145  /* If multi-ref we need to cope with changing reference per seq */
1146  if (c->multi_seq && !fd->no_ref) {
1147  if (bam_ref(b) != c->ref_seq_id && bam_ref(b) >= 0) {
1148  if (c->ref_seq_id >= 0)
1149  cram_ref_decr(fd->refs, c->ref_seq_id);
1150 
1151  if (!cram_get_ref(fd, bam_ref(b), 1, 0)) {
1152  fprintf(stderr, "Failed to load reference #%d\n",
1153  bam_ref(b));
1154  return -1;
1155  }
1156 
1157  c->ref_seq_id = bam_ref(b); // overwritten later by -2
1158  assert(fd->refs->ref_id[c->ref_seq_id]->seq);
1159  c->ref = fd->refs->ref_id[c->ref_seq_id]->seq;
1160  c->ref_start = 1;
1161  c->ref_end = fd->refs->ref_id[c->ref_seq_id]->length;
1162  }
1163  }
1164 
1165  process_one_read(fd, c, s, cr, b, r2);
1166 
1167  if (first_base > cr->apos)
1168  first_base = cr->apos;
1169 
1170  if (last_base < cr->aend)
1171  last_base = cr->aend;
1172  }
1173 
1174  if (c->multi_seq) {
1175  s->hdr->ref_seq_id = -2;
1176  s->hdr->ref_seq_start = 0;
1177  s->hdr->ref_seq_span = 0;
1178  } else {
1179  s->hdr->ref_seq_id = c->ref_id;
1180  s->hdr->ref_seq_start = first_base;
1181  s->hdr->ref_seq_span = last_base - first_base + 1;
1182  }
1183  s->hdr->num_records = r2;
1184  }
1185 
1186  /* Link our bams[] array onto the spare bam list for reuse */
1187  spares = malloc(sizeof(*spares));
1188  pthread_mutex_lock(&fd->bam_list_lock);
1189  spares->bams = c->bams;
1190  spares->next = fd->bl;
1191  fd->bl = spares;
1192  pthread_mutex_unlock(&fd->bam_list_lock);
1193  c->bams = NULL;
1194 
1195  /* Detect if a multi-seq container */
1196  cram_stats_encoding(fd, c->RI_stats);
1197  multi_ref = c->RI_stats->nvals > 1;
1198 
1199  if (multi_ref) {
1200  if (fd->verbose)
1201  fprintf(stderr, "Multi-ref container\n");
1202  c->ref_seq_id = -2;
1203  c->ref_seq_start = 0;
1204  c->ref_seq_span = 0;
1205  }
1206 
1207 
1208  /* Compute MD5s */
1209  for (i = 0; i < c->curr_slice; i++) {
1210  cram_slice *s = c->slices[i];
1211 
1212  if (fd->version != CRAM_1_VERS) {
1213  if (s->hdr->ref_seq_id >= 0 && c->multi_seq == 0 && !fd->no_ref) {
1214  MD5_CTX md5;
1215  MD5_Init(&md5);
1216  MD5_Update(&md5,
1217  c->ref + s->hdr->ref_seq_start - c->ref_start,
1218  s->hdr->ref_seq_span);
1219  MD5_Final(s->hdr->md5, &md5);
1220  } else {
1221  memset(s->hdr->md5, 0, 16);
1222  }
1223  }
1224  }
1225 
1226  c->num_records = 0;
1227  c->num_blocks = 0;
1228  c->length = 0;
1229 
1230  //fprintf(stderr, "=== BF ===\n");
1232  c->BF_stats, E_INT, NULL,
1233  fd->version);
1234 
1235  //fprintf(stderr, "=== CF ===\n");
1237  c->CF_stats, E_INT, NULL,
1238  fd->version);
1239 // fprintf(stderr, "=== RN ===\n");
1240 // h->RN_codec = cram_encoder_init(cram_stats_encoding(fd, c->RN_stats),
1241 // c->RN_stats, E_BYTE_ARRAY, NULL,
1242 // fd->version);
1243 
1244  //fprintf(stderr, "=== AP ===\n");
1245  if (c->pos_sorted) {
1247  c->AP_stats, E_INT, NULL,
1248  fd->version);
1249  } else {
1250  int p[2] = {0, c->max_apos};
1251  h->AP_codec = cram_encoder_init(E_BETA, NULL, E_INT, p, fd->version);
1252  }
1253 
1254  //fprintf(stderr, "=== RG ===\n");
1256  c->RG_stats, E_INT, NULL,
1257  fd->version);
1258 
1259  //fprintf(stderr, "=== MQ ===\n");
1261  c->MQ_stats, E_INT, NULL,
1262  fd->version);
1263 
1264  //fprintf(stderr, "=== NS ===\n");
1265 #ifdef NS_external
1267  (void *)CRAM_EXT_NS,
1268  fd->version);
1269 #else
1271  c->NS_stats, E_INT, NULL,
1272  fd->version);
1273 #endif
1274 
1275  //fprintf(stderr, "=== MF ===\n");
1277  c->MF_stats, E_INT, NULL,
1278  fd->version);
1279 
1280 #ifdef TS_external
1282  (void *)CRAM_EXT_TS_NP,
1283  fd->version);
1285  (void *)CRAM_EXT_TS_NP,
1286  fd->version);
1287 #else
1288  //fprintf(stderr, "=== TS ===\n");
1290  c->TS_stats, E_INT, NULL,
1291  fd->version);
1292  //fprintf(stderr, "=== NP ===\n");
1294  c->NP_stats, E_INT, NULL,
1295  fd->version);
1296 #endif
1297 
1298  //fprintf(stderr, "=== NF ===\n");
1300  c->NF_stats, E_INT, NULL,
1301  fd->version);
1302 
1303  //fprintf(stderr, "=== RL ===\n");
1305  c->RL_stats, E_INT, NULL,
1306  fd->version);
1307 
1308  //fprintf(stderr, "=== FN ===\n");
1310  c->FN_stats, E_INT, NULL,
1311  fd->version);
1312 
1313  //fprintf(stderr, "=== FC ===\n");
1315  c->FC_stats, E_BYTE, NULL,
1316  fd->version);
1317 
1318  //fprintf(stderr, "=== FP ===\n");
1320  c->FP_stats, E_INT, NULL,
1321  fd->version);
1322 
1323  //fprintf(stderr, "=== DL ===\n");
1325  c->DL_stats, E_INT, NULL,
1326  fd->version);
1327 
1328 #ifdef BA_external
1330  (void *)CRAM_EXT_BA,
1331  fd->version);
1332 #else
1333  //fprintf(stderr, "=== BA ===\n");
1335  c->BA_stats, E_BYTE, NULL,
1336  fd->version);
1337 #endif
1338 
1339  //fprintf(stderr, "=== BS ===\n");
1341  c->BS_stats, E_BYTE, NULL,
1342  fd->version);
1343 
1344  if (fd->version == CRAM_1_VERS) {
1345  h->TL_codec = NULL;
1346  h->RI_codec = NULL;
1347  h->RS_codec = NULL;
1348  h->PD_codec = NULL;
1349  h->HC_codec = NULL;
1350  h->SC_codec = NULL;
1351 
1352  //fprintf(stderr, "=== TC ===\n");
1354  c->TC_stats, E_BYTE, NULL,
1355  fd->version);
1356 
1357  //fprintf(stderr, "=== TN ===\n");
1358 #ifdef TN_external
1360  (void *)CRAM_EXT_TN,
1361  fd->version);
1362 #else
1364  c->TN_stats, E_INT, NULL,
1365  fd->version);
1366 #endif
1367  } else {
1368  int i2[2] = {0, CRAM_EXT_SC};
1369 
1370  h->TC_codec = NULL;
1371  h->TN_codec = NULL;
1372 
1373  //fprintf(stderr, "=== TL ===\n");
1375  c->TL_stats, E_INT, NULL,
1376  fd->version);
1377 
1378 
1379  //fprintf(stderr, "=== RI ===\n");
1381  c->RI_stats, E_INT, NULL,
1382  fd->version);
1383 
1384  //fprintf(stderr, "=== RS ===\n");
1386  c->RS_stats, E_INT, NULL,
1387  fd->version);
1388 
1389  //fprintf(stderr, "=== PD ===\n");
1391  c->PD_stats, E_INT, NULL,
1392  fd->version);
1393 
1394  //fprintf(stderr, "=== HC ===\n");
1396  c->HC_stats, E_INT, NULL,
1397  fd->version);
1398 
1399  //fprintf(stderr, "=== SC ===\n");
1401  E_BYTE_ARRAY, (void *)i2,
1402  fd->version);
1403  }
1404 
1405  //fprintf(stderr, "=== IN ===\n");
1406  {
1407  int i2[2] = {0, CRAM_EXT_IN};
1409  E_BYTE_ARRAY, (void *)i2,
1410  fd->version);
1411  }
1412 
1413  {
1414  //int i2[2] = {0, 1};
1415  //h->QS_codec = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL, (void *)i2,
1416  // fd->version);
1418  (void *)CRAM_EXT_QUAL,
1419  fd->version);
1420  }
1421  {
1422  int i2[2] = {0, CRAM_EXT_NAME};
1424  E_BYTE_ARRAY, (void *)i2,
1425  fd->version);
1426  }
1427 
1428 
1429  /* Encode slices */
1430  for (i = 0; i < c->curr_slice; i++) {
1431  if (fd->verbose)
1432  fprintf(stderr, "Encode slice %d\n", i);
1433  if (cram_encode_slice(fd, c, h, c->slices[i]) != 0)
1434  return -1;
1435  }
1436 
1437  /* Create compression header */
1438  {
1439  h->ref_seq_id = c->ref_seq_id;
1440  h->ref_seq_start = c->ref_seq_start;
1441  h->ref_seq_span = c->ref_seq_span;
1442  h->num_records = c->num_records;
1443 
1444  h->mapped_qs_included = 0; // fixme
1445  h->unmapped_qs_included = 0; // fixme
1446  // h->... fixme
1447  memcpy(h->substitution_matrix, CRAM_SUBST_MATRIX, 20);
1448 
1449  if (!(c_hdr = cram_encode_compression_header(fd, c, h)))
1450  return -1;
1451  }
1452 
1453  /* Compute landmarks */
1454  /* Fill out slice landmarks */
1455  c->num_landmarks = c->curr_slice;
1456  c->landmark = malloc(c->num_landmarks * sizeof(*c->landmark));
1457  if (!c->landmark)
1458  return -1;
1459 
1460  /*
1461  * Slice offset starts after the first block, so we need to simulate
1462  * writing it to work out the correct offset
1463  */
1464  {
1465  slice_offset = c_hdr->method == RAW
1466  ? c_hdr->uncomp_size
1467  : c_hdr->comp_size;
1468  slice_offset += 2 +
1469  itf8_size(c_hdr->content_id) +
1470  itf8_size(c_hdr->comp_size) +
1471  itf8_size(c_hdr->uncomp_size);
1472  }
1473 
1474  c->ref_seq_id = c->slices[0]->hdr->ref_seq_id;
1475  c->ref_seq_start = c->slices[0]->hdr->ref_seq_start;
1476  c->ref_seq_span = c->slices[0]->hdr->ref_seq_span;
1477  for (i = 0; i < c->curr_slice; i++) {
1478  cram_slice *s = c->slices[i];
1479 
1480  c->num_blocks += s->hdr->num_blocks + 2;
1481  c->landmark[i] = slice_offset;
1482 
1483  if (s->hdr->ref_seq_start + s->hdr->ref_seq_span >
1484  c->ref_seq_start + c->ref_seq_span) {
1486  - c->ref_seq_start;
1487  }
1488 
1489  slice_offset += s->hdr_block->method == RAW
1490  ? s->hdr_block->uncomp_size
1491  : s->hdr_block->comp_size;
1492 
1493  slice_offset += 2 +
1497 
1498  for (j = 0; j < s->hdr->num_blocks; j++) {
1499  slice_offset += 2 +
1500  itf8_size(s->block[j]->content_id) +
1501  itf8_size(s->block[j]->comp_size) +
1502  itf8_size(s->block[j]->uncomp_size);
1503 
1504  slice_offset += s->block[j]->method == RAW
1505  ? s->block[j]->uncomp_size
1506  : s->block[j]->comp_size;
1507  }
1508  }
1509  c->length += slice_offset; // just past the final slice
1510 
1511  c->comp_hdr_block = c_hdr;
1512 
1513  if (c->ref_seq_id >= 0) {
1514  cram_ref_decr(fd->refs, c->ref_seq_id);
1515  }
1516 
1517  /* Cache references up-front if we have unsorted access patterns */
1518  if (c->refs_used) {
1519  for (i = 0; i < fd->refs->nref; i++) {
1520  if (c->refs_used[i])
1521  cram_ref_decr(fd->refs, i);
1522  }
1523  }
1524 
1525  return 0;
1526 }
1527 
1528 
1529 /*
1530  * Adds a feature code to a read within a slice. For purposes of minimising
1531  * memory allocations and fragmentation we have one array of features for all
1532  * reads within the slice. We return the index into this array for this new
1533  * feature.
1534  *
1535  * Returns feature index on success
1536  * -1 on failure.
1537  */
1538 static int cram_add_feature(cram_container *c, cram_slice *s,
1539  cram_record *r, cram_feature *f) {
1540  if (s->nfeatures >= s->afeatures) {
1541  s->afeatures = s->afeatures ? s->afeatures*2 : 1024;
1542  s->features = realloc(s->features, s->afeatures*sizeof(*s->features));
1543  if (!s->features)
1544  return -1;
1545  }
1546 
1547  if (!r->nfeature++) {
1548  r->feature = s->nfeatures;
1549  cram_stats_add(c->FP_stats, f->X.pos);
1550  } else {
1552  f->X.pos - s->features[r->feature + r->nfeature-2].X.pos);
1553  }
1554  cram_stats_add(c->FC_stats, f->X.code);
1555 
1556  s->features[s->nfeatures++] = *f;
1557 
1558  return 0;
1559 }
1560 
1561 static int cram_add_substitution(cram_fd *fd, cram_container *c,
1562  cram_slice *s, cram_record *r,
1563  int pos, char base, char qual, char ref) {
1564  cram_feature f;
1565 
1566  // seq=ACGTN vs ref=ACGT or seq=ACGT vs ref=ACGTN
1567  if (fd->L2[(uc)base]<4 || (fd->L2[(uc)base]<5 && fd->L2[(uc)ref]<4)) {
1568  f.X.pos = pos+1;
1569  f.X.code = 'X';
1570  f.X.base = fd->cram_sub_matrix[ref&0x1f][base&0x1f];
1571  cram_stats_add(c->BS_stats, f.X.base);
1572  } else {
1573  f.B.pos = pos+1;
1574  f.B.code = 'B';
1575  f.B.base = base;
1576  f.B.qual = qual;
1577  cram_stats_add(c->BA_stats, f.B.base);
1578  cram_stats_add(c->QS_stats, f.B.qual);
1579  BLOCK_APPEND_CHAR(s->qual_blk, qual);
1580  }
1581  return cram_add_feature(c, s, r, &f);
1582 }
1583 
1584 static int cram_add_base(cram_fd *fd, cram_container *c,
1585  cram_slice *s, cram_record *r,
1586  int pos, char base, char qual) {
1587  cram_feature f;
1588  f.B.pos = pos+1;
1589  f.B.code = 'B';
1590  f.B.base = base;
1591  f.B.qual = qual;
1592 #ifdef BA_external
1593  s->BA_len++;
1594 #else
1595  cram_stats_add(c->BA_stats, base);
1596 #endif
1597  cram_stats_add(c->QS_stats, qual);
1598  BLOCK_APPEND_CHAR(s->qual_blk, qual);
1599  return cram_add_feature(c, s, r, &f);
1600 }
1601 
1602 static int cram_add_quality(cram_fd *fd, cram_container *c,
1603  cram_slice *s, cram_record *r,
1604  int pos, char qual) {
1605  cram_feature f;
1606  f.Q.pos = pos+1;
1607  f.Q.code = 'Q';
1608  f.Q.qual = qual;
1609  cram_stats_add(c->QS_stats, qual);
1610  BLOCK_APPEND_CHAR(s->qual_blk, qual);
1611  return cram_add_feature(c, s, r, &f);
1612 }
1613 
1614 static int cram_add_deletion(cram_container *c, cram_slice *s, cram_record *r,
1615  int pos, int len, char *base) {
1616  cram_feature f;
1617  f.D.pos = pos+1;
1618  f.D.code = 'D';
1619  f.D.len = len;
1620  cram_stats_add(c->DL_stats, len);
1621  return cram_add_feature(c, s, r, &f);
1622 }
1623 
1624 static int cram_add_softclip(cram_container *c, cram_slice *s, cram_record *r,
1625  int pos, int len, char *base, int version) {
1626  cram_feature f;
1627  f.S.pos = pos+1;
1628  f.S.code = 'S';
1629  f.S.len = len;
1630  if (version == CRAM_1_VERS) {
1631  f.S.seq_idx = BLOCK_SIZE(s->base_blk);
1632  BLOCK_APPEND(s->base_blk, base, len);
1633  BLOCK_APPEND_CHAR(s->base_blk, '\0');
1634  } else {
1635  f.S.seq_idx = BLOCK_SIZE(s->soft_blk);
1636  if (base) {
1637  BLOCK_APPEND(s->soft_blk, base, len);
1638  } else {
1639  int i;
1640  for (i = 0; i < len; i++)
1641  BLOCK_APPEND_CHAR(s->soft_blk, 'N');
1642  }
1643  BLOCK_APPEND_CHAR(s->soft_blk, '\0');
1644  }
1645  return cram_add_feature(c, s, r, &f);
1646 }
1647 
1648 static int cram_add_hardclip(cram_container *c, cram_slice *s, cram_record *r,
1649  int pos, int len, char *base) {
1650  cram_feature f;
1651  f.S.pos = pos+1;
1652  f.S.code = 'H';
1653  f.S.len = len;
1654  cram_stats_add(c->HC_stats, len);
1655  return cram_add_feature(c, s, r, &f);
1656 }
1657 
1658 static int cram_add_skip(cram_container *c, cram_slice *s, cram_record *r,
1659  int pos, int len, char *base) {
1660  cram_feature f;
1661  f.S.pos = pos+1;
1662  f.S.code = 'N';
1663  f.S.len = len;
1664  cram_stats_add(c->RS_stats, len);
1665  return cram_add_feature(c, s, r, &f);
1666 }
1667 
1668 static int cram_add_pad(cram_container *c, cram_slice *s, cram_record *r,
1669  int pos, int len, char *base) {
1670  cram_feature f;
1671  f.S.pos = pos+1;
1672  f.S.code = 'P';
1673  f.S.len = len;
1674  cram_stats_add(c->PD_stats, len);
1675  return cram_add_feature(c, s, r, &f);
1676 }
1677 
1678 static int cram_add_insertion(cram_container *c, cram_slice *s, cram_record *r,
1679  int pos, int len, char *base) {
1680  cram_feature f;
1681  f.I.pos = pos+1;
1682  if (len == 1) {
1683  char b = base ? *base : 'N';
1684  f.i.code = 'i';
1685  f.i.base = b;
1686 #ifdef BA_external
1687  s->BA_len++;
1688 #else
1689  cram_stats_add(c->BA_stats, b);
1690 #endif
1691  } else {
1692  f.I.code = 'I';
1693  f.I.len = len;
1694  f.S.seq_idx = BLOCK_SIZE(s->base_blk);
1695  if (base) {
1696  BLOCK_APPEND(s->base_blk, base, len);
1697  } else {
1698  int i;
1699  for (i = 0; i < len; i++)
1700  BLOCK_APPEND_CHAR(s->base_blk, 'N');
1701  }
1702  BLOCK_APPEND_CHAR(s->base_blk, '\0');
1703  }
1704  return cram_add_feature(c, s, r, &f);
1705 }
1706 
1707 /*
1708  * Encodes auxiliary data.
1709  * Returns the read-group parsed out of the BAM aux fields on success
1710  * NULL on failure or no rg present (FIXME)
1711  */
1712 static char *cram_encode_aux_1_0(cram_fd *fd, bam_seq_t *b, cram_container *c,
1713  cram_slice *s, cram_record *cr) {
1714  char *aux, *tmp, *rg = NULL, *tmp_tn;
1715  int aux_size = bam_blk_size(b) -
1716  ((char *)bam_aux(b) - (char *)&bam_ref(b));
1717 
1718  /* Worst case is 1 nul char on every ??:Z: string, so +33% */
1719  BLOCK_GROW(s->aux_blk, aux_size*1.34+1);
1720  tmp = (char *)BLOCK_END(s->aux_blk);
1721 
1722 #ifdef TN_external
1723  BLOCK_GROW(s->tn_blk, aux_size);
1724  tmp_tn = (char *)BLOCK_END(s->tn_blk);
1725 #endif
1726 
1727  aux = (char *)bam_aux(b);
1728 #ifndef TN_external
1729  cr->TN_idx = s->nTN;
1730 #endif
1731  while (aux[0] != 0) {
1732  int32_t i32;
1733  int r;
1734 
1735  if (aux[0] == 'R' && aux[1] == 'G' && aux[2] == 'Z') {
1736  rg = &aux[3];
1737  while (*aux++);
1738  continue;
1739  }
1740  if (aux[0] == 'M' && aux[1] == 'D' && aux[2] == 'Z') {
1741  while (*aux++);
1742  continue;
1743  }
1744  if (aux[0] == 'N' && aux[1] == 'M') {
1745  switch(aux[2]) {
1746  case 'A': case 'C': case 'c': aux+=4; break;
1747  case 'I': case 'i': case 'f': aux+=7; break;
1748  default:
1749  fprintf(stderr, "Unhandled type code for NM tag\n");
1750  return NULL;
1751  }
1752  continue;
1753  }
1754 
1755  cr->ntags++;
1756 
1757  i32 = (aux[0]<<16) | (aux[1]<<8) | aux[2];
1758  kh_put(s_i2i, c->tags_used, i32, &r);
1759  if (-1 == r)
1760  return NULL;
1761 
1762 #ifndef TN_external
1763  if (s->nTN >= s->aTN) {
1764  s->aTN = s->aTN ? s->aTN*2 : 1024;
1765  if (!(s->TN = realloc(s->TN, s->aTN * sizeof(*s->TN))))
1766  return NULL;
1767  }
1768  s->TN[s->nTN++] = i32;
1769  cram_stats_add(c->TN_stats, i32);
1770 #else
1771  tmp_tn += itf8_put(tmp_tn, i32);
1772 #endif
1773 
1774  switch(aux[2]) {
1775  case 'A': case 'C': case 'c':
1776  aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
1777  *tmp++=*aux++;
1778  break;
1779 
1780  case 'S': case 's':
1781  aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
1782  *tmp++=*aux++; *tmp++=*aux++;
1783  break;
1784 
1785  case 'I': case 'i': case 'f':
1786  aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
1787  *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
1788  break;
1789 
1790  case 'd':
1791  aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
1792  *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
1793  *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
1794  break;
1795 
1796  case 'Z': case 'H':
1797  aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
1798  while ((*tmp++=*aux++));
1799  *tmp++ = '\t'; // stop byte
1800  break;
1801 
1802  case 'B': {
1803  int type = aux[3], blen;
1804  uint32_t count = (uint32_t)((((unsigned char *)aux)[4]<< 0) +
1805  (((unsigned char *)aux)[5]<< 8) +
1806  (((unsigned char *)aux)[6]<<16) +
1807  (((unsigned char *)aux)[7]<<24));
1808  // skip TN field
1809  aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
1810 
1811  // We use BYTE_ARRAY_LEN with external length, so store that first
1812  switch (type) {
1813  case 'c': case 'C':
1814  blen = count;
1815  break;
1816  case 's': case 'S':
1817  blen = 2*count;
1818  break;
1819  case 'i': case 'I': case 'f':
1820  blen = 4*count;
1821  break;
1822  default:
1823  fprintf(stderr, "Unknown sub-type '%c' for aux type 'B'\n",
1824  type);
1825  return NULL;
1826 
1827  }
1828 
1829  tmp += itf8_put(tmp, blen+5);
1830 
1831  *tmp++=*aux++; // sub-type & length
1832  *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
1833 
1834  // The tag data itself
1835  memcpy(tmp, aux, blen); tmp += blen; aux += blen;
1836 
1837  //cram_stats_add(c->aux_B_stats, blen);
1838  break;
1839  }
1840  default:
1841  fprintf(stderr, "Unknown aux type '%c'\n", aux[2]);
1842  return NULL;
1843  }
1844  }
1845  cram_stats_add(c->TC_stats, cr->ntags);
1846 
1847  cr->aux = BLOCK_SIZE(s->aux_blk);
1848  cr->aux_size = (uc *)tmp - (BLOCK_DATA(s->aux_blk) + cr->aux);
1849  BLOCK_SIZE(s->aux_blk) = (uc *)tmp - BLOCK_DATA(s->aux_blk);
1850  assert(s->aux_blk->byte <= s->aux_blk->alloc);
1851 
1852 #ifdef TN_external
1853  cr->tn = BLOCK_SIZE(s->tn_blk);
1854  BLOCK_SIZE(s->tn_blk) = (uc *)tmp_tn - BLOCK_DATA(s->tn_blk);
1855  assert(s->tn_blk->byte <= s->tn_blk->alloc);
1856 #endif
1857 
1858  return rg;
1859 }
1860 
1861 /*
1862  * Encodes auxiliary data. Largely duplicated from above, but done so to
1863  * keep it simple and avoid a myriad of version ifs.
1864  *
1865  * Returns the read-group parsed out of the BAM aux fields on success
1866  * NULL on failure or no rg present (FIXME)
1867  */
1868 static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c,
1869  cram_slice *s, cram_record *cr) {
1870  char *aux, *orig, *tmp, *rg = NULL;
1871 #ifdef SAMTOOLS
1872  int aux_size = bam_get_l_aux(b);
1873 #else
1874  int aux_size = bam_blk_size(b) -
1875  ((char *)bam_aux(b) - (char *)&bam_ref(b));
1876 #endif
1877  cram_block *td_b = c->comp_hdr->TD_blk;
1878  int TD_blk_size = BLOCK_SIZE(td_b), new;
1879  char *key;
1880  khint_t k;
1881 
1882 
1883  /* Worst case is 1 nul char on every ??:Z: string, so +33% */
1884  BLOCK_GROW(s->aux_blk, aux_size*1.34+1);
1885  tmp = (char *)BLOCK_END(s->aux_blk);
1886 
1887 
1888  orig = aux = (char *)bam_aux(b);
1889 
1890  // Copy aux keys to td_b and aux values to s->aux_blk
1891  while (aux - orig < aux_size && aux[0] != 0) {
1892  uint32_t i32;
1893  int r;
1894 
1895  if (aux[0] == 'R' && aux[1] == 'G' && aux[2] == 'Z') {
1896  rg = &aux[3];
1897  while (*aux++);
1898  continue;
1899  }
1900  if (aux[0] == 'M' && aux[1] == 'D' && aux[2] == 'Z') {
1901  while (*aux++);
1902  continue;
1903  }
1904  if (aux[0] == 'N' && aux[1] == 'M') {
1905  switch(aux[2]) {
1906  case 'A': case 'C': case 'c': aux+=4; break;
1907  case 'S': case 's': aux+=5; break;
1908  case 'I': case 'i': case 'f': aux+=7; break;
1909  default:
1910  fprintf(stderr, "Unhandled type code for NM tag\n");
1911  return NULL;
1912  }
1913  continue;
1914  }
1915 
1916  BLOCK_APPEND(td_b, aux, 3);
1917 
1918  i32 = (aux[0]<<16) | (aux[1]<<8) | aux[2];
1919  kh_put(s_i2i, c->tags_used, i32, &r);
1920  if (-1 == r)
1921  return NULL;
1922 
1923  switch(aux[2]) {
1924  case 'A': case 'C': case 'c':
1925  aux+=3;
1926  *tmp++=*aux++;
1927  break;
1928 
1929  case 'S': case 's':
1930  aux+=3;
1931  *tmp++=*aux++; *tmp++=*aux++;
1932  break;
1933 
1934  case 'I': case 'i': case 'f':
1935  aux+=3;
1936  *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
1937  break;
1938 
1939  case 'd':
1940  aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
1941  *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
1942  *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
1943 
1944  case 'Z': case 'H':
1945  aux+=3;
1946  while ((*tmp++=*aux++));
1947  *tmp++ = '\t'; // stop byte
1948  break;
1949 
1950  case 'B': {
1951  int type = aux[3], blen;
1952  uint32_t count = (uint32_t)((((unsigned char *)aux)[4]<< 0) +
1953  (((unsigned char *)aux)[5]<< 8) +
1954  (((unsigned char *)aux)[6]<<16) +
1955  (((unsigned char *)aux)[7]<<24));
1956  // skip TN field
1957  aux+=3;
1958 
1959  // We use BYTE_ARRAY_LEN with external length, so store that first
1960  switch (type) {
1961  case 'c': case 'C':
1962  blen = count;
1963  break;
1964  case 's': case 'S':
1965  blen = 2*count;
1966  break;
1967  case 'i': case 'I': case 'f':
1968  blen = 4*count;
1969  break;
1970  default:
1971  fprintf(stderr, "Unknown sub-type '%c' for aux type 'B'\n",
1972  type);
1973  return NULL;
1974 
1975  }
1976 
1977  tmp += itf8_put(tmp, blen+5);
1978 
1979  *tmp++=*aux++; // sub-type & length
1980  *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
1981 
1982  // The tag data itself
1983  memcpy(tmp, aux, blen); tmp += blen; aux += blen;
1984 
1985  //cram_stats_add(c->aux_B_stats, blen);
1986  break;
1987  }
1988  default:
1989  fprintf(stderr, "Unknown aux type '%c'\n", aux[2]);
1990  return NULL;
1991  }
1992  }
1993 
1994  // FIXME: sort BLOCK_DATA(td_b) by char[3] triples
1995 
1996  // And and increment TD hash entry
1997  BLOCK_APPEND_CHAR(td_b, 0);
1998 
1999  // Duplicate key as BLOCK_DATA() can be realloced to a new pointer.
2000  key = string_ndup(c->comp_hdr->TD_keys,
2001  (char *)BLOCK_DATA(td_b) + TD_blk_size,
2002  BLOCK_SIZE(td_b) - TD_blk_size);
2003  k = kh_put(m_s2i, c->comp_hdr->TD_hash, key, &new);
2004  if (new < 0) {
2005  return NULL;
2006  } else if (new == 0) {
2007  BLOCK_SIZE(td_b) = TD_blk_size;
2008  } else {
2009  kh_val(c->comp_hdr->TD_hash, k) = c->comp_hdr->nTL;
2010  c->comp_hdr->nTL++;
2011  }
2012 
2013  cr->TL = kh_val(c->comp_hdr->TD_hash, k);
2014  cram_stats_add(c->TL_stats, cr->TL);
2015 
2016  cr->aux = BLOCK_SIZE(s->aux_blk);
2017  cr->aux_size = (uc *)tmp - (BLOCK_DATA(s->aux_blk) + cr->aux);
2018  BLOCK_SIZE(s->aux_blk) = (uc *)tmp - BLOCK_DATA(s->aux_blk);
2019  assert(s->aux_blk->byte <= s->aux_blk->alloc);
2020 
2021  return rg;
2022 }
2023 
2024 
2025 /*
2026  * Handles creation of a new container or new slice, flushing any
2027  * existing containers when appropriate.
2028  *
2029  * Really this is next slice, which may or may not lead to a new container.
2030  *
2031  * Returns cram_container pointer on success
2032  * NULL on failure.
2033  */
2034 static cram_container *cram_next_container(cram_fd *fd, bam_seq_t *b) {
2035  cram_container *c = fd->ctr;
2036  cram_slice *s;
2037  int i;
2038 
2039  /* First occurence */
2040  if (c->curr_ref == -2)
2041  c->curr_ref = bam_ref(b);
2042 
2043  if (c->slice) {
2044  s = c->slice;
2045  if (c->multi_seq) {
2046  s->hdr->ref_seq_id = -2;
2047  s->hdr->ref_seq_start = 0;
2048  s->hdr->ref_seq_span = 0;
2049  } else {
2050  s->hdr->ref_seq_id = c->curr_ref;
2051  s->hdr->ref_seq_start = c->first_base;
2052  s->hdr->ref_seq_span = c->last_base - c->first_base + 1;
2053  }
2054  s->hdr->num_records = c->curr_rec;
2055 
2056  if (c->curr_slice == 0) {
2057  if (c->ref_seq_id != s->hdr->ref_seq_id)
2058  c->ref_seq_id = s->hdr->ref_seq_id;
2059  c->ref_seq_start = c->first_base;
2060  }
2061 
2062  c->curr_slice++;
2063  }
2064 
2065  /* Flush container */
2066  if (c->curr_slice == c->max_slice ||
2067  (bam_ref(b) != c->curr_ref && !c->multi_seq)) {
2068  c->ref_seq_span = fd->last_base - c->ref_seq_start + 1;
2069  if (fd->verbose)
2070  fprintf(stderr, "Flush container %d/%d..%d\n",
2071  c->ref_seq_id, c->ref_seq_start,
2072  c->ref_seq_start + c->ref_seq_span -1);
2073 
2074  /* Encode slices */
2075  if (fd->pool) {
2076  if (-1 == cram_flush_container_mt(fd, c))
2077  return NULL;
2078  } else {
2079  if (-1 == cram_flush_container(fd, c))
2080  return NULL;
2081 
2082  // Move to sep func, as we need cram_flush_container for
2083  // the closing phase to flush the partial container.
2084  for (i = 0; i < c->max_slice; i++) {
2085  cram_free_slice(c->slices[i]);
2086  c->slices[i] = NULL;
2087  }
2088 
2089  c->slice = NULL;
2090  c->curr_slice = 0;
2091 
2092  /* Easy approach for purposes of freeing stats */
2094  }
2095 
2096  c = fd->ctr = cram_new_container(fd->seqs_per_slice,
2097  fd->slices_per_container);
2098  if (!c)
2099  return NULL;
2100  c->record_counter = fd->record_counter;
2101  c->curr_ref = bam_ref(b);
2102  }
2103 
2104  c->last_pos = c->first_base = c->last_base = bam_pos(b)+1;
2105 
2106  /* New slice */
2107  c->slice = c->slices[c->curr_slice] =
2109  if (!c->slice)
2110  return NULL;
2111 
2112  if (c->multi_seq) {
2113  c->slice->hdr->ref_seq_id = -2;
2114  c->slice->hdr->ref_seq_start = 0;
2115  c->slice->last_apos = 1;
2116  } else {
2117  c->slice->hdr->ref_seq_id = bam_ref(b);
2118  // wrong for unsorted data, will fix during encoding.
2119  c->slice->hdr->ref_seq_start = bam_pos(b)+1;
2120  c->slice->last_apos = bam_pos(b)+1;
2121  }
2122 
2123  c->curr_rec = 0;
2124 
2125  return c;
2126 }
2127 
2128 /*
2129  * Converts a single bam record into a cram record.
2130  * Possibly used within a thread.
2131  *
2132  * Returns 0 on success;
2133  * -1 on failure
2134  */
2135 static int process_one_read(cram_fd *fd, cram_container *c,
2136  cram_slice *s, cram_record *cr,
2137  bam_seq_t *b, int rnum) {
2138  int i, fake_qual = 0;
2139  char *cp, *rg;
2140  char *ref, *seq, *qual;
2141 
2142  // FIXME: multi-ref containers
2143 
2144  ref = c->ref;
2145 
2146  //fprintf(stderr, "%s => %d\n", rg ? rg : "\"\"", cr->rg);
2147 
2148  // Fields to resolve later
2149  //cr->mate_line; // index to another cram_record
2150  //cr->mate_flags; // MF
2151  //cr->ntags; // TC
2152  cr->ntags = 0; //cram_stats_add(c->TC_stats, cr->ntags);
2153  if (fd->version == CRAM_1_VERS)
2154  rg = cram_encode_aux_1_0(fd, b, c, s, cr);
2155  else
2156  rg = cram_encode_aux(fd, b, c, s, cr);
2157 
2158  //cr->aux_size = b->blk_size - ((char *)bam_aux(b) - (char *)&bam_ref(b));
2159  //cr->aux = DSTRING_LEN(s->aux_ds);
2160  //dstring_nappend(s->aux_ds, bam_aux(b), cr->aux_size);
2161 
2162  /* Read group, identified earlier */
2163  if (rg) {
2164  SAM_RG *brg = sam_hdr_find_rg(fd->header, rg);
2165  cr->rg = brg ? brg->id : -1;
2166  } else if (fd->version == CRAM_1_VERS) {
2167  SAM_RG *brg = sam_hdr_find_rg(fd->header, "UNKNOWN");
2168  assert(brg);
2169  } else {
2170  cr->rg = -1;
2171  }
2172  cram_stats_add(c->RG_stats, cr->rg);
2173 
2174 
2175  cr->ref_id = bam_ref(b); cram_stats_add(c->RI_stats, cr->ref_id);
2176  cr->flags = bam_flag(b);
2177  if (bam_cigar_len(b) == 0)
2178  cr->flags |= BAM_FUNMAP;
2179  cram_stats_add(c->BF_stats, fd->cram_flag_swap[cr->flags & 0xfff]);
2180 
2181  if (!fd->no_ref)
2183  else
2184  cr->cram_flags = 0;
2185  //cram_stats_add(c->CF_stats, cr->cram_flags);
2186 
2187  cr->len = bam_seq_len(b); cram_stats_add(c->RL_stats, cr->len);
2188  c->num_bases += cr->len;
2189  cr->apos = bam_pos(b)+1;
2190  if (c->pos_sorted) {
2191  if (cr->apos < s->last_apos) {
2192  c->pos_sorted = 0;
2193  } else {
2194  cram_stats_add(c->AP_stats, cr->apos - s->last_apos);
2195  s->last_apos = cr->apos;
2196  }
2197  } else {
2198  //cram_stats_add(c->AP_stats, cr->apos);
2199  }
2200  c->max_apos += (cr->apos > c->max_apos) * (cr->apos - c->max_apos);
2201 
2202  cr->name = BLOCK_SIZE(s->name_blk);
2203  cr->name_len = bam_name_len(b);
2204  cram_stats_add(c->RN_stats, cr->name_len);
2205 
2207 
2208 
2209  /*
2210  * This seqs_ds is largely pointless and it could reuse the same memory
2211  * over and over.
2212  * s->base_ds is what we need for encoding.
2213  */
2214  cr->seq = BLOCK_SIZE(s->seqs_blk);
2215  cr->qual = BLOCK_SIZE(s->qual_blk);
2216  BLOCK_GROW(s->seqs_blk, cr->len+1);
2217  BLOCK_GROW(s->qual_blk, cr->len);
2218  seq = cp = (char *)BLOCK_END(s->seqs_blk);
2219 
2220  *seq = 0;
2221  for (i = 0; i < cr->len; i++) {
2222  // FIXME: do 2 char at a time for efficiency
2223 #ifdef SAMTOOLS
2224  cp[i] = seq_nt16_str[bam_seqi(bam_seq(b), i)];
2225 #else
2226  cp[i] = bam_nt16_rev_table[bam_seqi(bam_seq(b), i)];
2227 #endif
2228  }
2229  BLOCK_SIZE(s->seqs_blk) += cr->len;
2230 
2231  qual = cp = (char *)bam_qual(b);
2232 
2233  /* Copy and parse */
2234  if (!(cr->flags & BAM_FUNMAP)) {
2235  int32_t *cig_to, *cig_from;
2236  int apos = cr->apos-1, spos = 0;
2237 
2238  cr->cigar = s->ncigar;
2239  cr->ncigar = bam_cigar_len(b);
2240  while (cr->cigar + cr->ncigar >= s->cigar_alloc) {
2241  s->cigar_alloc = s->cigar_alloc ? s->cigar_alloc*2 : 1024;
2242  s->cigar = realloc(s->cigar, s->cigar_alloc * sizeof(*s->cigar));
2243  if (!s->cigar)
2244  return -1;
2245  }
2246 
2247  cig_to = (int32_t *)s->cigar;
2248  cig_from = (int32_t *)bam_cigar(b);
2249 
2250  cr->feature = 0;
2251  cr->nfeature = 0;
2252  for (i = 0; i < cr->ncigar; i++) {
2253  enum cigar_op cig_op = cig_from[i] & BAM_CIGAR_MASK;
2254  int cig_len = cig_from[i] >> BAM_CIGAR_SHIFT;
2255  cig_to[i] = cig_from[i];
2256 
2257  /* Can also generate events from here for CRAM diffs */
2258 
2259  switch (cig_op) {
2260  int l;
2261 
2262  // Don't trust = and X ops to be correct.
2263  case BAM_CMATCH:
2264  case BAM_CBASE_MATCH:
2265  case BAM_CBASE_MISMATCH:
2266  //fprintf(stderr, "\nBAM_CMATCH\nR: %.*s\nS: %.*s\n",
2267  // cig_len, &ref[apos], cig_len, &seq[spos]);
2268  l = 0;
2269  if (!fd->no_ref && cr->len) {
2270  int end = cig_len+apos < c->ref_end
2271  ? cig_len : c->ref_end - apos;
2272  for (l = 0; l < end && seq[spos]; l++, apos++, spos++) {
2273  if (ref[apos] != seq[spos]) {
2274  //fprintf(stderr, "Subst: %d; %c vs %c\n",
2275  // spos, ref[apos], seq[spos]);
2276  if (cram_add_substitution(fd, c, s, cr, spos,
2277  seq[spos], qual[spos],
2278  ref[apos]))
2279  return -1;
2280  }
2281  }
2282  }
2283 
2284  if (l < cig_len && cr->len) {
2285  /* off end of sequence or non-ref based output */
2286  for (; l < cig_len && seq[spos]; l++, spos++) {
2287  if (cram_add_base(fd, c, s, cr, spos,
2288  seq[spos], qual[spos]))
2289  return -1;
2290  }
2291  apos += cig_len;
2292  } else if (!cr->len) {
2293  /* Seq "*" */
2294  apos += cig_len;
2295  spos += cig_len;
2296  }
2297  break;
2298 
2299  case BAM_CDEL:
2300  if (cram_add_deletion(c, s, cr, spos, cig_len, &seq[spos]))
2301  return -1;
2302  apos += cig_len;
2303  break;
2304 
2305  case BAM_CREF_SKIP:
2306  if (cram_add_skip(c, s, cr, spos, cig_len, &seq[spos]))
2307  return -1;
2308  apos += cig_len;
2309  break;
2310 
2311  case BAM_CINS:
2312  if (cram_add_insertion(c, s, cr, spos, cig_len,
2313  cr->len ? &seq[spos] : NULL))
2314  return -1;
2315  if (fd->no_ref && cr->len) {
2316  for (l = 0; l < cig_len; l++, spos++) {
2317  cram_add_quality(fd, c, s, cr, spos, qual[spos]);
2318  }
2319  } else {
2320  spos += cig_len;
2321  }
2322  break;
2323 
2324  case BAM_CSOFT_CLIP:
2325  if (cram_add_softclip(c, s, cr, spos, cig_len,
2326  cr->len ? &seq[spos] : NULL,
2327  fd->version))
2328  return -1;
2329  if (fd->no_ref) {
2330  if (cr->len) {
2331  for (l = 0; l < cig_len; l++, spos++) {
2332  cram_add_quality(fd, c, s, cr, spos, qual[spos]);
2333  }
2334  } else {
2335  for (l = 0; l < cig_len; l++, spos++) {
2336  cram_add_quality(fd, c, s, cr, spos, -1);
2337  }
2338  }
2339  } else {
2340  spos += cig_len;
2341  }
2342  break;
2343 
2344  case BAM_CHARD_CLIP:
2345  if (cram_add_hardclip(c, s, cr, spos, cig_len, &seq[spos]))
2346  return -1;
2347  break;
2348 
2349  case BAM_CPAD:
2350  if (cram_add_pad(c, s, cr, spos, cig_len, &seq[spos]))
2351  return -1;
2352  break;
2353  }
2354  }
2355  fake_qual = spos;
2356  cr->aend = MIN(apos, c->ref_end);
2357  cram_stats_add(c->FN_stats, cr->nfeature);
2358  } else {
2359  // Unmapped
2361  cr->cigar = 0;
2362  cr->ncigar = 0;
2363  cr->nfeature = 0;
2364  cr->aend = cr->apos;
2365 #ifdef BA_external
2366  s->BA_len += cr->len;
2367 #else
2368  for (i = 0; i < cr->len; i++)
2369  cram_stats_add(c->BA_stats, seq[i]);
2370 #endif
2371  }
2372 
2373  /*
2374  * Append to the qual block now. We do this here as
2375  * cram_add_substitution() can generate BA/QS events which need to
2376  * be in the qual block before we append the rest of the data.
2377  */
2379  /* Special case of seq "*" */
2380  if (cr->len == 0) {
2381  cram_stats_add(c->RL_stats, cr->len = fake_qual);
2382  BLOCK_GROW(s->qual_blk, cr->len);
2383  cp = (char *)BLOCK_END(s->qual_blk);
2384  memset(cp, 255, cr->len);
2385  } else {
2386  BLOCK_GROW(s->qual_blk, cr->len);
2387  cp = (char *)BLOCK_END(s->qual_blk);
2388  char *from = (char *)&bam_qual(b)[0];
2389  char *to = &cp[0];
2390  memcpy(to, from, cr->len);
2391  //for (i = 0; i < cr->len; i++) cp[i] = from[i];
2392  }
2393  BLOCK_SIZE(s->qual_blk) += cr->len;
2394  } else {
2395  if (cr->len == 0) {
2396  cram_stats_add(c->RL_stats, cr->len = cr->aend - cr->apos + 1);
2397  }
2398  }
2399 
2400  /* Now we know apos and aend both, update mate-pair information */
2401  {
2402  int new;
2403  khint_t k;
2404 
2405  //fprintf(stderr, "Checking %"PRId64"/%.*s\t", rnum,
2406  // cr->name_len, DSTRING_STR(s->name_ds)+cr->name);
2407  if (cr->flags & BAM_FPAIRED) {
2408  char *key = string_ndup(s->pair_keys,
2409  (char *)BLOCK_DATA(s->name_blk)+cr->name,
2410  cr->name_len);
2411  if (!key)
2412  return -1;
2413 
2414  k = kh_put(m_s2i, s->pair, key, &new);
2415  if (-1 == new)
2416  return -1;
2417  else if (new > 0)
2418  kh_val(s->pair, k) = rnum;
2419  } else {
2420  new = 1;
2421  }
2422 
2423  if (new == 0) {
2424  cram_record *p = &s->crecs[kh_val(s->pair, k)];
2425 
2426  //fprintf(stderr, "paired %"PRId64"\n", kh_val(s->pair, k));
2427 
2428  // copy from p to cr
2429  cr->mate_pos = p->apos;
2430  cram_stats_add(c->NP_stats, cr->mate_pos);
2431 
2432  cr->tlen = cr->aend - p->apos;
2433  cram_stats_add(c->TS_stats, cr->tlen);
2434 
2435  cr->mate_flags =
2436  ((p->flags & BAM_FMUNMAP) == BAM_FMUNMAP) * CRAM_M_UNMAP +
2439 
2440  // copy from cr to p
2441  cram_stats_del(c->NP_stats, p->mate_pos);
2442  p->mate_pos = cr->apos;
2443  cram_stats_add(c->NP_stats, p->mate_pos);
2444 
2445  cram_stats_del(c->MF_stats, p->mate_flags);
2446  p->mate_flags =
2447  ((cr->flags & BAM_FMUNMAP) == BAM_FMUNMAP) * CRAM_M_UNMAP +
2449  cram_stats_add(c->MF_stats, p->mate_flags);
2450 
2451  cram_stats_del(c->TS_stats, p->tlen);
2452  p->tlen = p->apos - cr->aend;
2453  cram_stats_add(c->TS_stats, p->tlen);
2454 
2455  // Clear detached from cr flags
2456  //cram_stats_del(c->CF_stats, cr->cram_flags);
2459 
2460  // Clear detached from p flags and set downstream
2461  cram_stats_del(c->CF_stats, p->cram_flags);
2462  p->cram_flags &= ~CRAM_FLAG_DETACHED;
2463  p->cram_flags |= CRAM_FLAG_MATE_DOWNSTREAM;
2464  cram_stats_add(c->CF_stats, p->cram_flags);
2465 
2466  p->mate_line = rnum - (kh_val(s->pair, k) + 1);
2467  cram_stats_add(c->NF_stats, p->mate_line);
2468 
2469  kh_val(s->pair, k) = rnum;
2470  } else {
2471  //fprintf(stderr, "unpaired\n");
2472 
2473  /* Derive mate flags from this flag */
2474  cr->mate_flags = 0;
2475  if (bam_flag(b) & BAM_FMUNMAP)
2476  cr->mate_flags |= CRAM_M_UNMAP;
2477  if (bam_flag(b) & BAM_FMREVERSE)
2478  cr->mate_flags |= CRAM_M_REVERSE;
2479 
2481 
2482  cr->mate_pos = MAX(bam_mate_pos(b)+1, 0);
2483  cram_stats_add(c->NP_stats, cr->mate_pos);
2484 
2485  cr->tlen = bam_ins_size(b);
2486  cram_stats_add(c->TS_stats, cr->tlen);
2487 
2490  }
2491  }
2492 
2493  cr->mqual = bam_map_qual(b);
2494  cram_stats_add(c->MQ_stats, cr->mqual);
2495 
2496  cr->mate_ref_id = bam_mate_ref(b);
2498 
2499  if (!(bam_flag(b) & BAM_FUNMAP)) {
2500  if (c->first_base > cr->apos)
2501  c->first_base = cr->apos;
2502 
2503  if (c->last_base < cr->aend)
2504  c->last_base = cr->aend;
2505  }
2506 
2507  return 0;
2508 }
2509 
2510 /*
2511  * Write iterator: put BAM format sequences into a CRAM file.
2512  * We buffer up a containers worth of data at a time.
2513  *
2514  * Returns 0 on success
2515  * -1 on failure
2516  */
2518  cram_container *c;
2519 
2520  if (!fd->ctr) {
2522  fd->slices_per_container);
2523  if (!fd->ctr)
2524  return -1;
2525  fd->ctr->record_counter = fd->record_counter;
2526  }
2527  c = fd->ctr;
2528 
2529  if (!c->slice || c->curr_rec == c->max_rec ||
2530  (bam_ref(b) != c->curr_ref && c->curr_ref >= -1)) {
2531  int slice_rec, curr_rec, multi_seq = fd->multi_seq == 1;
2532  int curr_ref = c->slice ? c->curr_ref : bam_ref(b);
2533 
2534 
2535  /*
2536  * Start packing slices when we routinely have under 1/4tr full.
2537  *
2538  * This option isn't available if we choose to embed references
2539  * since we can only have one per slice.
2540  */
2541  if (fd->multi_seq == -1 && c->curr_rec < c->max_rec/4+10 &&
2542  fd->last_slice && fd->last_slice < c->max_rec/4+10 &&
2543  !fd->embed_ref) {
2544  if (fd->verbose && !c->multi_seq)
2545  fprintf(stderr, "Multi-ref enabled for this container\n");
2546  multi_seq = 1;
2547  }
2548 
2549  slice_rec = c->slice_rec;
2550  curr_rec = c->curr_rec;
2551 
2552  if (fd->version == CRAM_1_VERS ||
2553  c->curr_rec == c->max_rec || fd->multi_seq != 1 || !c->slice)
2554  if (NULL == (c = cram_next_container(fd, b)))
2555  return -1;
2556 
2557  /*
2558  * Due to our processing order, some things we've already done we
2559  * cannot easily undo. So when we first notice we should be packing
2560  * multiple sequences per container we emit the small partial
2561  * container as-is and then start a fresh one in a different mode.
2562  */
2563  if (multi_seq) {
2564  fd->multi_seq = 1;
2565  c->multi_seq = 1;
2566  c->pos_sorted = 0; // required atm for multi_seq slices
2567 
2568  if (!c->refs_used) {
2569  pthread_mutex_lock(&fd->ref_lock);
2570  c->refs_used = calloc(fd->refs->nref, sizeof(int));
2571  pthread_mutex_unlock(&fd->ref_lock);
2572  if (!c->refs_used)
2573  return -1;
2574  }
2575  }
2576 
2577  fd->last_slice = curr_rec - slice_rec;
2578  c->slice_rec = c->curr_rec;
2579 
2580  // Have we seen this reference before?
2581  if (bam_ref(b) >= 0 && bam_ref(b) != curr_ref && !fd->embed_ref &&
2582  !fd->unsorted) {
2583 
2584  if (!c->refs_used) {
2585  pthread_mutex_lock(&fd->ref_lock);
2586  c->refs_used = calloc(fd->refs->nref, sizeof(int));
2587  pthread_mutex_unlock(&fd->ref_lock);
2588  if (!c->refs_used)
2589  return -1;
2590  } else if (c->refs_used && c->refs_used[bam_ref(b)]) {
2591  fprintf(stderr, "Unsorted mode enabled\n");
2592  pthread_mutex_lock(&fd->ref_lock);
2593  fd->unsorted = 1;
2594  pthread_mutex_unlock(&fd->ref_lock);
2595  fd->multi_seq = 1;
2596  }
2597  }
2598 
2599  c->curr_ref = bam_ref(b);
2600  if (c->refs_used && c->curr_ref >= 0) c->refs_used[c->curr_ref]++;
2601  }
2602 
2603  if (!c->bams) {
2604  /* First time through, allocate a set of bam pointers */
2605  pthread_mutex_lock(&fd->bam_list_lock);
2606  if (fd->bl) {
2607  spare_bams *spare = fd->bl;
2608  c->bams = spare->bams;
2609  fd->bl = spare->next;
2610  free(spare);
2611  } else {
2612  c->bams = calloc(c->max_c_rec, sizeof(bam_seq_t *));
2613  if (!c->bams)
2614  return -1;
2615  }
2616  pthread_mutex_unlock(&fd->bam_list_lock);
2617  }
2618 
2619  /* Copy or alloc+copy the bam record, for later encoding */
2620  if (c->bams[c->curr_c_rec])
2621  bam_copy(&c->bams[c->curr_c_rec], b);
2622  else
2623  c->bams[c->curr_c_rec] = bam_dup(b);
2624 
2625  c->curr_rec++;
2626  c->curr_c_rec++;
2627  fd->record_counter++;
2628 
2629  return 0;
2630 }