NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
sam.c
Go to the documentation of this file.
1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <string.h>
4 #include <errno.h>
5 #include <ctype.h>
6 #include <zlib.h>
7 #include "htslib/sam.h"
8 #include "htslib/bgzf.h"
9 #include "cram/cram.h"
10 #include "htslib/hfile.h"
11 
12 #include "htslib/khash.h"
14 
15 typedef khash_t(s2i) sdict_t;
16 
17 /**********************
18  *** BAM header I/O ***
19  **********************/
20 
22 {
23  return (bam_hdr_t*)calloc(1, sizeof(bam_hdr_t));
24 }
25 
27 {
28  int32_t i;
29  if (h == NULL) return;
30  if (h->target_name) {
31  for (i = 0; i < h->n_targets; ++i)
32  free(h->target_name[i]);
33  free(h->target_name);
34  free(h->target_len);
35  }
36  free(h->text); free(h->cigar_tab);
37  if (h->sdict) kh_destroy(s2i, (sdict_t*)h->sdict);
38  free(h);
39 }
40 
42 {
43  if (h0 == NULL) return NULL;
44  bam_hdr_t *h;
45  if ((h = bam_hdr_init()) == NULL) return NULL;
46  // copy the simple data
47  h->n_targets = h0->n_targets;
49  h->l_text = h0->l_text;
50  // Then the pointery stuff
51  h->cigar_tab = NULL;
52  h->sdict = NULL;
53  h->text = (char*)calloc(h->l_text + 1, 1);
54  memcpy(h->text, h0->text, h->l_text);
55  h->target_len = (uint32_t*)calloc(h->n_targets, 4);
56  h->target_name = (char**)calloc(h->n_targets, sizeof(char*));
57  int i;
58  for (i = 0; i < h->n_targets; ++i) {
59  h->target_len[i] = h0->target_len[i];
60  h->target_name[i] = strdup(h0->target_name[i]);
61  }
62  return h;
63 }
64 
65 
66 static bam_hdr_t *hdr_from_dict(sdict_t *d)
67 {
68  bam_hdr_t *h;
69  khint_t k;
70  h = bam_hdr_init();
71  h->sdict = d;
72  h->n_targets = kh_size(d);
73  h->target_len = (uint32_t*)malloc(4 * h->n_targets);
74  h->target_name = (char**)malloc(sizeof(char*) * h->n_targets);
75  for (k = kh_begin(d); k != kh_end(d); ++k) {
76  if (!kh_exist(d, k)) continue;
77  h->target_name[kh_val(d, k)>>32] = (char*)kh_key(d, k);
78  h->target_len[kh_val(d, k)>>32] = kh_val(d, k)<<32>>32;
79  kh_val(d, k) >>= 32;
80  }
81  return h;
82 }
83 
85 {
86  bam_hdr_t *h;
87  char buf[4];
88  int magic_len, has_EOF;
89  int32_t i = 1, name_len;
90  // check EOF
91  has_EOF = bgzf_check_EOF(fp);
92  if (has_EOF < 0) {
93  perror("[W::sam_hdr_read] bgzf_check_EOF");
94  } else if (has_EOF == 0 && hts_verbose >= 2)
95  fprintf(stderr, "[W::%s] EOF marker is absent. The input is probably truncated.\n", __func__);
96  // read "BAM1"
97  magic_len = bgzf_read(fp, buf, 4);
98  if (magic_len != 4 || strncmp(buf, "BAM\1", 4)) {
99  if (hts_verbose >= 1) fprintf(stderr, "[E::%s] invalid BAM binary header\n", __func__);
100  return 0;
101  }
102  h = bam_hdr_init();
103  // read plain text and the number of reference sequences
104  bgzf_read(fp, &h->l_text, 4);
105  if (fp->is_be) ed_swap_4p(&h->l_text);
106  h->text = (char*)malloc(h->l_text + 1);
107  h->text[h->l_text] = 0; // make sure it is NULL terminated
108  bgzf_read(fp, h->text, h->l_text);
109  bgzf_read(fp, &h->n_targets, 4);
110  if (fp->is_be) ed_swap_4p(&h->n_targets);
111  // read reference sequence names and lengths
112  h->target_name = (char**)calloc(h->n_targets, sizeof(char*));
113  h->target_len = (uint32_t*)calloc(h->n_targets, 4);
114  for (i = 0; i != h->n_targets; ++i) {
115  bgzf_read(fp, &name_len, 4);
116  if (fp->is_be) ed_swap_4p(&name_len);
117  h->target_name[i] = (char*)calloc(name_len, 1);
118  bgzf_read(fp, h->target_name[i], name_len);
119  bgzf_read(fp, &h->target_len[i], 4);
120  if (fp->is_be) ed_swap_4p(&h->target_len[i]);
121  }
122  return h;
123 }
124 
125 int bam_hdr_write(BGZF *fp, const bam_hdr_t *h)
126 {
127  char buf[4];
128  int32_t i, name_len, x;
129  // write "BAM1"
130  strncpy(buf, "BAM\1", 4);
131  bgzf_write(fp, buf, 4);
132  // write plain text and the number of reference sequences
133  if (fp->is_be) {
134  x = ed_swap_4(h->l_text);
135  bgzf_write(fp, &x, 4);
136  if (h->l_text) bgzf_write(fp, h->text, h->l_text);
137  x = ed_swap_4(h->n_targets);
138  bgzf_write(fp, &x, 4);
139  } else {
140  bgzf_write(fp, &h->l_text, 4);
141  if (h->l_text) bgzf_write(fp, h->text, h->l_text);
142  bgzf_write(fp, &h->n_targets, 4);
143  }
144  // write sequence names and lengths
145  for (i = 0; i != h->n_targets; ++i) {
146  char *p = h->target_name[i];
147  name_len = strlen(p) + 1;
148  if (fp->is_be) {
149  x = ed_swap_4(name_len);
150  bgzf_write(fp, &x, 4);
151  } else bgzf_write(fp, &name_len, 4);
152  bgzf_write(fp, p, name_len);
153  if (fp->is_be) {
154  x = ed_swap_4(h->target_len[i]);
155  bgzf_write(fp, &x, 4);
156  } else bgzf_write(fp, &h->target_len[i], 4);
157  }
158  bgzf_flush(fp);
159  return 0;
160 }
161 
162 int bam_name2id(bam_hdr_t *h, const char *ref)
163 {
164  sdict_t *d = (sdict_t*)h->sdict;
165  khint_t k;
166  if (h->sdict == 0) {
167  int i, absent;
168  d = kh_init(s2i);
169  for (i = 0; i < h->n_targets; ++i) {
170  k = kh_put(s2i, d, h->target_name[i], &absent);
171  kh_val(d, k) = i;
172  }
173  h->sdict = d;
174  }
175  k = kh_get(s2i, d, ref);
176  return k == kh_end(d)? -1 : kh_val(d, k);
177 }
178 
179 /*************************
180  *** BAM alignment I/O ***
181  *************************/
182 
184 {
185  return (bam1_t*)calloc(1, sizeof(bam1_t));
186 }
187 
189 {
190  if (b == 0) return;
191  free(b->data); free(b);
192 }
193 
194 bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc)
195 {
196  uint8_t *data = bdst->data;
197  int m_data = bdst->m_data; // backup data and m_data
198  if (m_data < bsrc->l_data) { // double the capacity
199  m_data = bsrc->l_data; kroundup32(m_data);
200  data = (uint8_t*)realloc(data, m_data);
201  }
202  memcpy(data, bsrc->data, bsrc->l_data); // copy var-len data
203  *bdst = *bsrc; // copy the rest
204  // restore the backup
205  bdst->m_data = m_data;
206  bdst->data = data;
207  return bdst;
208 }
209 
210 bam1_t *bam_dup1(const bam1_t *bsrc)
211 {
212  if (bsrc == NULL) return NULL;
213  bam1_t *bdst = bam_init1();
214  if (bdst == NULL) return NULL;
215  return bam_copy1(bdst, bsrc);
216 }
217 
218 int bam_cigar2qlen(int n_cigar, const uint32_t *cigar)
219 {
220  int k, l;
221  for (k = l = 0; k < n_cigar; ++k)
222  if (bam_cigar_type(bam_cigar_op(cigar[k]))&1)
223  l += bam_cigar_oplen(cigar[k]);
224  return l;
225 }
226 
227 int bam_cigar2rlen(int n_cigar, const uint32_t *cigar)
228 {
229  int k, l;
230  for (k = l = 0; k < n_cigar; ++k)
231  if (bam_cigar_type(bam_cigar_op(cigar[k]))&2)
232  l += bam_cigar_oplen(cigar[k]);
233  return l;
234 }
235 
237 {
238  if (!(b->core.flag & BAM_FUNMAP) && b->core.n_cigar > 0)
239  return b->core.pos + bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
240  else
241  return b->core.pos + 1;
242 }
243 
244 static inline int aux_type2size(uint8_t type)
245 {
246  switch (type) {
247  case 'A': case 'c': case 'C':
248  return 1;
249  case 's': case 'S':
250  return 2;
251  case 'i': case 'I': case 'f':
252  return 4;
253  case 'd':
254  return 8;
255  case 'Z': case 'H': case 'B':
256  return type;
257  default:
258  return 0;
259  }
260 }
261 
262 static void swap_data(const bam1_core_t *c, int l_data, uint8_t *data, int is_host)
263 {
264  uint8_t *s;
265  uint32_t *cigar = (uint32_t*)(data + c->l_qname);
266  uint32_t i, n;
267  s = data + c->n_cigar*4 + c->l_qname + c->l_qseq + (c->l_qseq + 1)/2;
268  for (i = 0; i < c->n_cigar; ++i) ed_swap_4p(&cigar[i]);
269  while (s < data + l_data) {
270  int size;
271  s += 2; // skip key
272  size = aux_type2size(*s); ++s; // skip type
273  switch (size) {
274  case 1: ++s; break;
275  case 2: ed_swap_2p(s); s += 2; break;
276  case 4: ed_swap_4p(s); s += 4; break;
277  case 8: ed_swap_8p(s); s += 8; break;
278  case 'Z':
279  case 'H':
280  while (*s) ++s;
281  ++s;
282  break;
283  case 'B':
284  size = aux_type2size(*s); ++s;
285  if (is_host) memcpy(&n, s, 4), ed_swap_4p(s);
286  else ed_swap_4p(s), memcpy(&n, s, 4);
287  s += 4;
288  switch (size) {
289  case 1: s += n; break;
290  case 2: for (i = 0; i < n; ++i, s += 2) ed_swap_2p(s); break;
291  case 4: for (i = 0; i < n; ++i, s += 4) ed_swap_4p(s); break;
292  case 8: for (i = 0; i < n; ++i, s += 8) ed_swap_8p(s); break;
293  }
294  break;
295  }
296  }
297 }
298 
299 int bam_read1(BGZF *fp, bam1_t *b)
300 {
301  bam1_core_t *c = &b->core;
302  int32_t block_len, ret, i;
303  uint32_t x[8];
304  if ((ret = bgzf_read(fp, &block_len, 4)) != 4) {
305  if (ret == 0) return -1; // normal end-of-file
306  else return -2; // truncated
307  }
308  if (bgzf_read(fp, x, 32) != 32) return -3;
309  if (fp->is_be) {
310  ed_swap_4p(&block_len);
311  for (i = 0; i < 8; ++i) ed_swap_4p(x + i);
312  }
313  c->tid = x[0]; c->pos = x[1];
314  c->bin = x[2]>>16; c->qual = x[2]>>8&0xff; c->l_qname = x[2]&0xff;
315  c->flag = x[3]>>16; c->n_cigar = x[3]&0xffff;
316  c->l_qseq = x[4];
317  c->mtid = x[5]; c->mpos = x[6]; c->isize = x[7];
318  b->l_data = block_len - 32;
319  if (b->l_data < 0 || c->l_qseq < 0) return -4;
320  if ((char *)bam_get_aux(b) - (char *)b->data > b->l_data)
321  return -4;
322  if (b->m_data < b->l_data) {
323  b->m_data = b->l_data;
324  kroundup32(b->m_data);
325  b->data = (uint8_t*)realloc(b->data, b->m_data);
326  if (!b->data)
327  return -4;
328  }
329  if (bgzf_read(fp, b->data, b->l_data) != b->l_data) return -4;
330  //b->l_aux = b->l_data - c->n_cigar * 4 - c->l_qname - c->l_qseq - (c->l_qseq+1)/2;
331  if (fp->is_be) swap_data(c, b->l_data, b->data, 0);
332  return 4 + block_len;
333 }
334 
335 int bam_write1(BGZF *fp, const bam1_t *b)
336 {
337  const bam1_core_t *c = &b->core;
338  uint32_t x[8], block_len = b->l_data + 32, y;
339  int i, ok;
340  x[0] = c->tid;
341  x[1] = c->pos;
342  x[2] = (uint32_t)c->bin<<16 | c->qual<<8 | c->l_qname;
343  x[3] = (uint32_t)c->flag<<16 | c->n_cigar;
344  x[4] = c->l_qseq;
345  x[5] = c->mtid;
346  x[6] = c->mpos;
347  x[7] = c->isize;
348  ok = (bgzf_flush_try(fp, 4 + block_len) >= 0);
349  if (fp->is_be) {
350  for (i = 0; i < 8; ++i) ed_swap_4p(x + i);
351  y = block_len;
352  if (ok) ok = (bgzf_write(fp, ed_swap_4p(&y), 4) >= 0);
353  swap_data(c, b->l_data, b->data, 1);
354  } else {
355  if (ok) ok = (bgzf_write(fp, &block_len, 4) >= 0);
356  }
357  if (ok) ok = (bgzf_write(fp, x, 32) >= 0);
358  if (ok) ok = (bgzf_write(fp, b->data, b->l_data) >= 0);
359  if (fp->is_be) swap_data(c, b->l_data, b->data, 0);
360  return ok? 4 + block_len : -1;
361 }
362 
363 /********************
364  *** BAM indexing ***
365  ********************/
366 
367 static hts_idx_t *bam_index(BGZF *fp, int min_shift)
368 {
369  int n_lvls, i, fmt;
370  bam1_t *b;
371  hts_idx_t *idx;
372  bam_hdr_t *h;
373  h = bam_hdr_read(fp);
374  if (min_shift > 0) {
375  int64_t max_len = 0, s;
376  for (i = 0; i < h->n_targets; ++i)
377  if (max_len < h->target_len[i]) max_len = h->target_len[i];
378  max_len += 256;
379  for (n_lvls = 0, s = 1<<min_shift; max_len > s; ++n_lvls, s <<= 3);
380  fmt = HTS_FMT_CSI;
381  } else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_BAI;
382  idx = hts_idx_init(h->n_targets, fmt, bgzf_tell(fp), min_shift, n_lvls);
383  bam_hdr_destroy(h);
384  b = bam_init1();
385  while (bam_read1(fp, b) >= 0) {
386  int l, ret;
388  if (l == 0) l = 1; // no zero-length records
389  ret = hts_idx_push(idx, b->core.tid, b->core.pos, b->core.pos + l, bgzf_tell(fp), !(b->core.flag&BAM_FUNMAP));
390  if (ret < 0)
391  {
392  // unsorted
393  bam_destroy1(b);
394  hts_idx_destroy(idx);
395  return NULL;
396  }
397  }
398  hts_idx_finish(idx, bgzf_tell(fp));
399  bam_destroy1(b);
400  return idx;
401 }
402 
403 int bam_index_build(const char *fn, int min_shift)
404 {
405  hts_idx_t *idx;
406  htsFile *fp;
407  int ret = 0;
408 
409  if ((fp = hts_open(fn, "r")) == 0) return -1;
410  if (fp->is_cram) {
411  ret = cram_index_build(fp->fp.cram, fn);
412  } else {
413  idx = bam_index(fp->fp.bgzf, min_shift);
414  if ( !idx )
415  {
416  hts_close(fp);
417  return -1;
418  }
419  hts_idx_save(idx, fn, min_shift > 0
421  hts_idx_destroy(idx);
422  }
423  hts_close(fp);
424 
425  return ret;
426 }
427 
428 static int bam_readrec(BGZF *fp, void *ignored, void *bv, int *tid, int *beg, int *end)
429 {
430  bam1_t *b = bv;
431  int ret;
432  if ((ret = bam_read1(fp, b)) >= 0) {
433  *tid = b->core.tid; *beg = b->core.pos;
434  *end = b->core.pos + (b->core.n_cigar? bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b)) : 1);
435  }
436  return ret;
437 }
438 
439 // This is used only with read_rest=1 iterators, so need not set tid/beg/end.
440 static int cram_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, int *beg, int *end)
441 {
442  htsFile *fp = fpv;
443  bam1_t *b = bv;
444  return cram_get_bam_seq(fp->fp.cram, &b);
445 }
446 
447 // This is used only with read_rest=1 iterators, so need not set tid/beg/end.
448 static int sam_bam_cram_readrec(BGZF *bgzfp, void *fpv, void *bv, int *tid, int *beg, int *end)
449 {
450  htsFile *fp = fpv;
451  bam1_t *b = bv;
452  if (fp->is_bin) return bam_read1(bgzfp, b);
453  else if (fp->is_cram) return cram_get_bam_seq(fp->fp.cram, &b);
454  else {
455  // TODO Need headers available to implement this for SAM files
456  fprintf(stderr, "[sam_bam_cram_readrec] Not implemented for SAM files -- Exiting\n");
457  abort();
458  }
459 }
460 
461 // The CRAM implementation stores the loaded index within the cram_fd rather
462 // than separately as is done elsewhere in htslib. So if p is a pointer to
463 // an hts_idx_t with p->fmt == HTS_FMT_CRAI, then it actually points to an
464 // hts_cram_idx_t and should be cast accordingly.
465 typedef struct hts_cram_idx_t {
466  int fmt;
469 
470 hts_idx_t *sam_index_load(samFile *fp, const char *fn)
471 {
472  if (fp->is_bin) return bam_index_load(fn);
473  else if (fp->is_cram) {
474  if (cram_index_load(fp->fp.cram, fn) < 0) return NULL;
475  // Cons up a fake "index" just pointing at the associated cram_fd:
476  hts_cram_idx_t *idx = malloc(sizeof (hts_cram_idx_t));
477  if (idx == NULL) return NULL;
478  idx->fmt = HTS_FMT_CRAI;
479  idx->cram = fp->fp.cram;
480  return (hts_idx_t *) idx;
481  }
482  else return NULL; // TODO Would use tbx_index_load if it returned hts_idx_t
483 }
484 
485 static hts_itr_t *cram_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec)
486 {
487  const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
488  hts_itr_t *iter = (hts_itr_t *) calloc(1, sizeof(hts_itr_t));
489  if (iter == NULL) return NULL;
490 
491  // Cons up a dummy iterator for which hts_itr_next() will simply invoke
492  // the readrec function:
493  iter->read_rest = 1;
494  iter->off = NULL;
495  iter->bins.a = NULL;
496  iter->readrec = readrec;
497 
498  if (tid >= 0) {
499  cram_range r = { tid, beg+1, end };
500  if (cram_set_option(cidx->cram, CRAM_OPT_RANGE, &r) != 0) { free(iter); return NULL; }
501  iter->curr_off = 0;
502  // The following fields are not required by hts_itr_next(), but are
503  // filled in in case user code wants to look at them.
504  iter->tid = tid;
505  iter->beg = beg;
506  iter->end = end;
507  }
508  else switch (tid) {
509  case HTS_IDX_REST:
510  iter->curr_off = 0;
511  break;
512  case HTS_IDX_NONE:
513  iter->curr_off = 0;
514  iter->finished = 1;
515  break;
516  default:
517  fprintf(stderr, "[cram_itr_query] tid=%d not implemented for CRAM files -- Exiting\n", tid);
518  abort();
519  break;
520  }
521 
522  return iter;
523 }
524 
525 hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, int beg, int end)
526 {
527  const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
528  if (idx == NULL)
529  return hts_itr_query(NULL, tid, beg, end, sam_bam_cram_readrec);
530  else if (cidx->fmt == HTS_FMT_CRAI)
531  return cram_itr_query(idx, tid, beg, end, cram_readrec);
532  else
533  return hts_itr_query(idx, tid, beg, end, bam_readrec);
534 }
535 
536 static int cram_name2id(void *fdv, const char *ref)
537 {
538  cram_fd *fd = (cram_fd *) fdv;
539  return sam_hdr_name2ref(fd->header, ref);
540 }
541 
542 hts_itr_t *sam_itr_querys(const hts_idx_t *idx, bam_hdr_t *hdr, const char *region)
543 {
544  const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
545  if (cidx->fmt == HTS_FMT_CRAI)
546  return hts_itr_querys(idx, region, cram_name2id, cidx->cram, cram_itr_query, cram_readrec);
547  else
548  return hts_itr_querys(idx, region, (hts_name2id_f)(bam_name2id), hdr, hts_itr_query, bam_readrec);
549 }
550 
551 /**********************
552  *** SAM header I/O ***
553  **********************/
554 
555 #include "htslib/kseq.h"
556 #include "htslib/kstring.h"
557 
558 bam_hdr_t *sam_hdr_parse(int l_text, const char *text)
559 {
560  const char *q, *r, *p;
561  khash_t(s2i) *d;
562  d = kh_init(s2i);
563  for (p = text; *p; ++p) {
564  if (strncmp(p, "@SQ", 3) == 0) {
565  char *sn = 0;
566  int ln = -1;
567  for (q = p + 4;; ++q) {
568  if (strncmp(q, "SN:", 3) == 0) {
569  q += 3;
570  for (r = q; *r != '\t' && *r != '\n'; ++r);
571  sn = (char*)calloc(r - q + 1, 1);
572  strncpy(sn, q, r - q);
573  q = r;
574  } else if (strncmp(q, "LN:", 3) == 0)
575  ln = strtol(q + 3, (char**)&q, 10);
576  while (*q != '\t' && *q != '\n') ++q;
577  if (*q == '\n') break;
578  }
579  p = q;
580  if (sn && ln >= 0) {
581  khint_t k;
582  int absent;
583  k = kh_put(s2i, d, sn, &absent);
584  if (!absent) {
585  if (hts_verbose >= 2)
586  fprintf(stderr, "[W::%s] duplicated sequence '%s'\n", __func__, sn);
587  free(sn);
588  } else kh_val(d, k) = (int64_t)(kh_size(d) - 1)<<32 | ln;
589  }
590  }
591  while (*p != '\n') ++p;
592  }
593  return hdr_from_dict(d);
594 }
595 
597 {
598  if (fp->is_bin) {
599  return bam_hdr_read(fp->fp.bgzf);
600  } else if (fp->is_cram) {
601  return cram_header_to_bam(fp->fp.cram->header);
602  } else {
603  kstring_t str;
604  bam_hdr_t *h;
605  int has_SQ = 0;
606  str.l = str.m = 0; str.s = 0;
607  while (hts_getline(fp, KS_SEP_LINE, &fp->line) >= 0) {
608  if (fp->line.s[0] != '@') break;
609  if (fp->line.l > 3 && strncmp(fp->line.s,"@SQ",3) == 0) has_SQ = 1;
610  kputsn(fp->line.s, fp->line.l, &str);
611  kputc('\n', &str);
612  }
613  if (! has_SQ && fp->fn_aux) {
614  char line[2048];
615  FILE *f = fopen(fp->fn_aux, "r");
616  if (f == NULL) return NULL;
617  while (fgets(line, sizeof line, f)) {
618  const char *name = strtok(line, "\t");
619  const char *length = strtok(NULL, "\t");
620  ksprintf(&str, "@SQ\tSN:%s\tLN:%s\n", name, length);
621  }
622  fclose(f);
623  }
624  if (str.l == 0) kputsn("", 0, &str);
625  h = sam_hdr_parse(str.l, str.s);
626  h->l_text = str.l; h->text = str.s;
627  return h;
628  }
629 }
630 
631 int sam_hdr_write(htsFile *fp, const bam_hdr_t *h)
632 {
633  if (fp->is_bin) {
634  bam_hdr_write(fp->fp.bgzf, h);
635  } else if (fp->is_cram) {
636  cram_fd *fd = fp->fp.cram;
637  if (cram_set_header(fd, bam_header_to_cram((bam_hdr_t *)h)) < 0) return -1;
638  if (fp->fn_aux)
639  cram_load_reference(fd, fp->fn_aux);
640  if (cram_write_SAM_hdr(fd, fd->header) < 0) return -1;
641  } else {
642  char *p;
643  hputs(h->text, fp->fp.hfile);
644  p = strstr(h->text, "@SQ\t"); // FIXME: we need a loop to make sure "@SQ\t" does not match something unwanted!!!
645  if (p == 0) {
646  int i;
647  for (i = 0; i < h->n_targets; ++i) {
648  fp->line.l = 0;
649  kputsn("@SQ\tSN:", 7, &fp->line); kputs(h->target_name[i], &fp->line);
650  kputsn("\tLN:", 4, &fp->line); kputw(h->target_len[i], &fp->line); kputc('\n', &fp->line);
651  if ( hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l ) return -1;
652  }
653  }
654  if ( hflush(fp->fp.hfile) != 0 ) return -1;
655  }
656  return 0;
657 }
658 
659 /**********************
660  *** SAM record I/O ***
661  **********************/
662 
664 {
665 #define _read_token(_p) (_p); for (; *(_p) && *(_p) != '\t'; ++(_p)); if (*(_p) != '\t') goto err_ret; *(_p)++ = 0
666 #define _read_token_aux(_p) (_p); for (; *(_p) && *(_p) != '\t'; ++(_p)); *(_p)++ = 0 // this is different in that it does not test *(_p)=='\t'
667 #define _get_mem(type_t, _x, _s, _l) ks_resize((_s), (_s)->l + (_l)); *(_x) = (type_t*)((_s)->s + (_s)->l); (_s)->l += (_l)
668 #define _parse_err(cond, msg) do { if ((cond) && hts_verbose >= 1) { fprintf(stderr, "[E::%s] " msg "\n", __func__); goto err_ret; } } while (0)
669 #define _parse_warn(cond, msg) if ((cond) && hts_verbose >= 2) fprintf(stderr, "[W::%s] " msg "\n", __func__)
670 
671  uint8_t *t;
672  char *p = s->s, *q;
673  int i;
674  kstring_t str;
675  bam1_core_t *c = &b->core;
676 
677  str.l = b->l_data = 0;
678  str.s = (char*)b->data; str.m = b->m_data;
679  memset(c, 0, 32);
680  if (h->cigar_tab == 0) {
681  h->cigar_tab = (int8_t*) malloc(128);
682  for (i = 0; i < 128; ++i)
683  h->cigar_tab[i] = -1;
684  for (i = 0; BAM_CIGAR_STR[i]; ++i)
685  h->cigar_tab[(int)BAM_CIGAR_STR[i]] = i;
686  }
687  // qname
688  q = _read_token(p);
689  kputsn_(q, p - q, &str);
690  c->l_qname = p - q;
691  // flag
692  c->flag = strtol(p, &p, 0);
693  if (*p++ != '\t') goto err_ret; // malformated flag
694  // chr
695  q = _read_token(p);
696  if (strcmp(q, "*")) {
697  _parse_err(h->n_targets == 0, "missing SAM header");
698  c->tid = bam_name2id(h, q);
699  _parse_warn(c->tid < 0, "urecognized reference name; treated as unmapped");
700  } else c->tid = -1;
701  // pos
702  c->pos = strtol(p, &p, 10) - 1;
703  if (*p++ != '\t') goto err_ret;
704  if (c->pos < 0 && c->tid >= 0) {
705  _parse_warn(1, "mapped query cannot have zero coordinate; treated as unmapped");
706  c->tid = -1;
707  }
708  if (c->tid < 0) c->flag |= BAM_FUNMAP;
709  // mapq
710  c->qual = strtol(p, &p, 10);
711  if (*p++ != '\t') goto err_ret;
712  // cigar
713  if (*p != '*') {
714  uint32_t *cigar;
715  size_t n_cigar = 0;
716  for (q = p; *p && *p != '\t'; ++p)
717  if (!isdigit(*p)) ++n_cigar;
718  if (*p++ != '\t') goto err_ret;
719  _parse_err(n_cigar >= 65536, "too many CIGAR operations");
720  c->n_cigar = n_cigar;
721  _get_mem(uint32_t, &cigar, &str, c->n_cigar<<2);
722  for (i = 0; i < c->n_cigar; ++i, ++q) {
723  int op;
724  cigar[i] = strtol(q, &q, 10)<<BAM_CIGAR_SHIFT;
725  op = (uint8_t)*q >= 128? -1 : h->cigar_tab[(int)*q];
726  _parse_err(op < 0, "unrecognized CIGAR operator");
727  cigar[i] |= op;
728  }
729  i = bam_cigar2rlen(c->n_cigar, cigar);
730  } else {
731  _parse_warn(!(c->flag&BAM_FUNMAP), "mapped query must have a CIGAR; treated as unmapped");
732  c->flag |= BAM_FUNMAP;
733  q = _read_token(p);
734  i = 1;
735  }
736  c->bin = hts_reg2bin(c->pos, c->pos + i, 14, 5);
737  // mate chr
738  q = _read_token(p);
739  if (strcmp(q, "=") == 0) c->mtid = c->tid;
740  else if (strcmp(q, "*") == 0) c->mtid = -1;
741  else c->mtid = bam_name2id(h, q);
742  // mpos
743  c->mpos = strtol(p, &p, 10) - 1;
744  if (*p++ != '\t') goto err_ret;
745  if (c->mpos < 0 && c->mtid >= 0) {
746  _parse_warn(1, "mapped mate cannot have zero coordinate; treated as unmapped");
747  c->mtid = -1;
748  }
749  // tlen
750  c->isize = strtol(p, &p, 10);
751  if (*p++ != '\t') goto err_ret;
752  // seq
753  q = _read_token(p);
754  if (strcmp(q, "*")) {
755  c->l_qseq = p - q - 1;
756  i = bam_cigar2qlen(c->n_cigar, (uint32_t*)(str.s + c->l_qname));
757  _parse_err(c->n_cigar && i != c->l_qseq, "CIGAR and query sequence are of different length");
758  i = (c->l_qseq + 1) >> 1;
759  _get_mem(uint8_t, &t, &str, i);
760  memset(t, 0, i);
761  for (i = 0; i < c->l_qseq; ++i)
762  t[i>>1] |= seq_nt16_table[(int)q[i]] << ((~i&1)<<2);
763  } else c->l_qseq = 0;
764  // qual
765  q = _read_token_aux(p);
766  _get_mem(uint8_t, &t, &str, c->l_qseq);
767  if (strcmp(q, "*")) {
768  _parse_err(p - q - 1 != c->l_qseq, "SEQ and QUAL are of different length");
769  for (i = 0; i < c->l_qseq; ++i) t[i] = q[i] - 33;
770  } else memset(t, 0xff, c->l_qseq);
771  // aux
772  // Note that (like the bam1_core_t fields) this aux data in b->data is
773  // stored in host endianness; so there is no byte swapping needed here.
774  while (p < s->s + s->l) {
775  uint8_t type;
776  q = _read_token_aux(p); // FIXME: can be accelerated for long 'B' arrays
777  _parse_err(p - q - 1 < 6, "incomplete aux field");
778  kputsn_(q, 2, &str);
779  q += 3; type = *q++; ++q; // q points to value
780  if (type == 'A' || type == 'a' || type == 'c' || type == 'C') {
781  kputc_('A', &str);
782  kputc_(*q, &str);
783  } else if (type == 'i' || type == 'I') {
784  long x;
785  x = strtol(q, &q, 10);
786  if (x < 0) {
787  if (x >= INT8_MIN) {
788  kputc_('c', &str); kputc_(x, &str);
789  } else if (x >= INT16_MIN) {
790  int16_t y = x;
791  kputc_('s', &str); kputsn_((char*)&y, 2, &str);
792  } else {
793  int32_t y = x;
794  kputc_('i', &str); kputsn_(&y, 4, &str);
795  }
796  } else {
797  if (x <= UINT8_MAX) {
798  kputc_('C', &str); kputc_(x, &str);
799  } else if (x <= UINT16_MAX) {
800  uint16_t y = x;
801  kputc_('S', &str); kputsn_(&y, 2, &str);
802  } else {
803  uint32_t y = x;
804  kputc_('I', &str); kputsn_(&y, 4, &str);
805  }
806  }
807  } else if (type == 'f') {
808  float x;
809  x = strtod(q, &q);
810  kputc_('f', &str); kputsn_(&x, 4, &str);
811  } else if (type == 'd') {
812  double x;
813  x = strtod(q, &q);
814  kputc_('d', &str); kputsn_(&x, 8, &str);
815  } else if (type == 'Z' || type == 'H') {
816  kputc_(type, &str);kputsn_(q, p - q, &str); // note that this include the trailing NULL
817  } else if (type == 'B') {
818  int32_t n;
819  char *r;
820  _parse_err(p - q - 1 < 3, "incomplete B-typed aux field");
821  type = *q++; // q points to the first ',' following the typing byte
822  for (r = q, n = 0; *r; ++r)
823  if (*r == ',') ++n;
824  kputc_('B', &str); kputc_(type, &str); kputsn_(&n, 4, &str);
825  // FIXME: to evaluate which is faster: a) aligned array and then memmove(); b) unaligned array; c) kputsn_()
826  if (type == 'c') while (q + 1 < p) { int8_t x = strtol(q + 1, &q, 0); kputc_(x, &str); }
827  else if (type == 'C') while (q + 1 < p) { uint8_t x = strtoul(q + 1, &q, 0); kputc_(x, &str); }
828  else if (type == 's') while (q + 1 < p) { int16_t x = strtol(q + 1, &q, 0); kputsn_(&x, 2, &str); }
829  else if (type == 'S') while (q + 1 < p) { uint16_t x = strtoul(q + 1, &q, 0); kputsn_(&x, 2, &str); }
830  else if (type == 'i') while (q + 1 < p) { int32_t x = strtol(q + 1, &q, 0); kputsn_(&x, 4, &str); }
831  else if (type == 'I') while (q + 1 < p) { uint32_t x = strtoul(q + 1, &q, 0); kputsn_(&x, 4, &str); }
832  else if (type == 'f') while (q + 1 < p) { float x = strtod(q + 1, &q); kputsn_(&x, 4, &str); }
833  else _parse_err(1, "unrecognized type");
834  } else _parse_err(1, "unrecognized type");
835  }
836  b->data = (uint8_t*)str.s; b->l_data = str.l; b->m_data = str.m;
837  return 0;
838 
839 #undef _parse_warn
840 #undef _parse_err
841 #undef _get_mem
842 #undef _read_token_aux
843 #undef _read_token
844 err_ret:
845  b->data = (uint8_t*)str.s; b->l_data = str.l; b->m_data = str.m;
846  return -2;
847 }
848 
850 {
851  if (fp->is_bin) {
852  int r = bam_read1(fp->fp.bgzf, b);
853  if (r >= 0) {
854  if (b->core.tid >= h->n_targets || b->core.tid < -1 ||
855  b->core.mtid >= h->n_targets || b->core.mtid < -1)
856  return -3;
857  }
858  return r;
859  } else if (fp->is_cram) {
860  return cram_get_bam_seq(fp->fp.cram, &b);
861  } else {
862  int ret;
863 err_recover:
864  if (fp->line.l == 0) {
865  ret = hts_getline(fp, KS_SEP_LINE, &fp->line);
866  if (ret < 0) return -1;
867  }
868  ret = sam_parse1(&fp->line, h, b);
869  fp->line.l = 0;
870  if (ret < 0) {
871  if (hts_verbose >= 1)
872  fprintf(stderr, "[W::%s] parse error at line %lld\n", __func__, (long long)fp->lineno);
873  if (h->ignore_sam_err) goto err_recover;
874  }
875  return ret;
876  }
877 }
878 
879 int sam_format1(const bam_hdr_t *h, const bam1_t *b, kstring_t *str)
880 {
881  int i;
882  uint8_t *s;
883  const bam1_core_t *c = &b->core;
884 
885  str->l = 0;
886  kputsn(bam_get_qname(b), c->l_qname-1, str); kputc('\t', str); // query name
887  kputw(c->flag, str); kputc('\t', str); // flag
888  if (c->tid >= 0) { // chr
889  kputs(h->target_name[c->tid] , str);
890  kputc('\t', str);
891  } else kputsn("*\t", 2, str);
892  kputw(c->pos + 1, str); kputc('\t', str); // pos
893  kputw(c->qual, str); kputc('\t', str); // qual
894  if (c->n_cigar) { // cigar
895  uint32_t *cigar = bam_get_cigar(b);
896  for (i = 0; i < c->n_cigar; ++i) {
897  kputw(bam_cigar_oplen(cigar[i]), str);
898  kputc(bam_cigar_opchr(cigar[i]), str);
899  }
900  } else kputc('*', str);
901  kputc('\t', str);
902  if (c->mtid < 0) kputsn("*\t", 2, str); // mate chr
903  else if (c->mtid == c->tid) kputsn("=\t", 2, str);
904  else {
905  kputs(h->target_name[c->mtid], str);
906  kputc('\t', str);
907  }
908  kputw(c->mpos + 1, str); kputc('\t', str); // mate pos
909  kputw(c->isize, str); kputc('\t', str); // template len
910  if (c->l_qseq) { // seq and qual
911  uint8_t *s = bam_get_seq(b);
912  for (i = 0; i < c->l_qseq; ++i) kputc("=ACMGRSVTWYHKDBN"[bam_seqi(s, i)], str);
913  kputc('\t', str);
914  s = bam_get_qual(b);
915  if (s[0] == 0xff) kputc('*', str);
916  else for (i = 0; i < c->l_qseq; ++i) kputc(s[i] + 33, str);
917  } else kputsn("*\t*", 3, str);
918  s = bam_get_aux(b); // aux
919  while (s+4 <= b->data + b->l_data) {
920  uint8_t type, key[2];
921  key[0] = s[0]; key[1] = s[1];
922  s += 2; type = *s++;
923  kputc('\t', str); kputsn((char*)key, 2, str); kputc(':', str);
924  if (type == 'A') {
925  kputsn("A:", 2, str);
926  kputc(*s, str);
927  ++s;
928  } else if (type == 'C') {
929  kputsn("i:", 2, str);
930  kputw(*s, str);
931  ++s;
932  } else if (type == 'c') {
933  kputsn("i:", 2, str);
934  kputw(*(int8_t*)s, str);
935  ++s;
936  } else if (type == 'S') {
937  if (s+2 <= b->data + b->l_data) {
938  kputsn("i:", 2, str);
939  kputw(*(uint16_t*)s, str);
940  s += 2;
941  } else return -1;
942  } else if (type == 's') {
943  if (s+2 <= b->data + b->l_data) {
944  kputsn("i:", 2, str);
945  kputw(*(int16_t*)s, str);
946  s += 2;
947  } else return -1;
948  } else if (type == 'I') {
949  if (s+4 <= b->data + b->l_data) {
950  kputsn("i:", 2, str);
951  kputuw(*(uint32_t*)s, str);
952  s += 4;
953  } else return -1;
954  } else if (type == 'i') {
955  if (s+4 <= b->data + b->l_data) {
956  kputsn("i:", 2, str);
957  kputw(*(int32_t*)s, str);
958  s += 4;
959  } else return -1;
960  } else if (type == 'f') {
961  if (s+4 <= b->data + b->l_data) {
962  ksprintf(str, "f:%g", *(float*)s);
963  s += 4;
964  } else return -1;
965 
966  } else if (type == 'd') {
967  if (s+8 <= b->data + b->l_data) {
968  ksprintf(str, "d:%g", *(double*)s);
969  s += 8;
970  } else return -1;
971  } else if (type == 'Z' || type == 'H') {
972  kputc(type, str); kputc(':', str);
973  while (s < b->data + b->l_data && *s) kputc(*s++, str);
974  if (s >= b->data + b->l_data)
975  return -1;
976  ++s;
977  } else if (type == 'B') {
978  uint8_t sub_type = *(s++);
979  int32_t n;
980  memcpy(&n, s, 4);
981  s += 4; // no point to the start of the array
982  if (s + n >= b->data + b->l_data)
983  return -1;
984  kputsn("B:", 2, str); kputc(sub_type, str); // write the typing
985  for (i = 0; i < n; ++i) { // FIXME: for better performance, put the loop after "if"
986  kputc(',', str);
987  if ('c' == sub_type) { kputw(*(int8_t*)s, str); ++s; }
988  else if ('C' == sub_type) { kputw(*(uint8_t*)s, str); ++s; }
989  else if ('s' == sub_type) { kputw(*(int16_t*)s, str); s += 2; }
990  else if ('S' == sub_type) { kputw(*(uint16_t*)s, str); s += 2; }
991  else if ('i' == sub_type) { kputw(*(int32_t*)s, str); s += 4; }
992  else if ('I' == sub_type) { kputuw(*(uint32_t*)s, str); s += 4; }
993  else if ('f' == sub_type) { ksprintf(str, "%g", *(float*)s); s += 4; }
994  }
995  }
996  }
997  return str->l;
998 }
999 
1000 int sam_write1(htsFile *fp, const bam_hdr_t *h, const bam1_t *b)
1001 {
1002  if (fp->is_bin) {
1003  return bam_write1(fp->fp.bgzf, b);
1004  } else if (fp->is_cram) {
1005  return cram_put_bam_seq(fp->fp.cram, (bam1_t *)b);
1006  } else {
1007  if (sam_format1(h, b, &fp->line) < 0) return -1;
1008  kputc('\n', &fp->line);
1009  if ( hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l ) return -1;
1010  return fp->line.l;
1011  }
1012 }
1013 
1014 /************************
1015  *** Auxiliary fields ***
1016  ************************/
1017 
1018 void bam_aux_append(bam1_t *b, const char tag[2], char type, int len, uint8_t *data)
1019 {
1020  int ori_len = b->l_data;
1021  b->l_data += 3 + len;
1022  if (b->m_data < b->l_data) {
1023  b->m_data = b->l_data;
1024  kroundup32(b->m_data);
1025  b->data = (uint8_t*)realloc(b->data, b->m_data);
1026  }
1027  b->data[ori_len] = tag[0]; b->data[ori_len + 1] = tag[1];
1028  b->data[ori_len + 2] = type;
1029  memcpy(b->data + ori_len + 3, data, len);
1030 }
1031 
1032 static inline uint8_t *skip_aux(uint8_t *s)
1033 {
1034  int size = aux_type2size(*s); ++s; // skip type
1035  uint32_t n;
1036  switch (size) {
1037  case 'Z':
1038  case 'H':
1039  while (*s) ++s;
1040  return s + 1;
1041  case 'B':
1042  size = aux_type2size(*s); ++s;
1043  memcpy(&n, s, 4); s += 4;
1044  return s + size * n;
1045  case 0:
1046  abort();
1047  break;
1048  default:
1049  return s + size;
1050  }
1051 }
1052 
1053 uint8_t *bam_aux_get(const bam1_t *b, const char tag[2])
1054 {
1055  uint8_t *s;
1056  int y = tag[0]<<8 | tag[1];
1057  s = bam_get_aux(b);
1058  while (s < b->data + b->l_data) {
1059  int x = (int)s[0]<<8 | s[1];
1060  s += 2;
1061  if (x == y) return s;
1062  s = skip_aux(s);
1063  }
1064  return 0;
1065 }
1066 // s MUST BE returned by bam_aux_get()
1068 {
1069  uint8_t *p, *aux;
1070  int l_aux = bam_get_l_aux(b);
1071  aux = bam_get_aux(b);
1072  p = s - 2;
1073  s = skip_aux(s);
1074  memmove(p, s, l_aux - (s - aux));
1075  b->l_data -= s - p;
1076  return 0;
1077 }
1078 
1080 {
1081  int type;
1082  type = *s++;
1083  if (type == 'c') return (int32_t)*(int8_t*)s;
1084  else if (type == 'C') return (int32_t)*(uint8_t*)s;
1085  else if (type == 's') return (int32_t)*(int16_t*)s;
1086  else if (type == 'S') return (int32_t)*(uint16_t*)s;
1087  else if (type == 'i' || type == 'I') return *(int32_t*)s;
1088  else return 0;
1089 }
1090 
1091 double bam_aux2f(const uint8_t *s)
1092 {
1093  int type;
1094  type = *s++;
1095  if (type == 'd') return *(double*)s;
1096  else if (type == 'f') return *(float*)s;
1097  else return 0.0;
1098 }
1099 
1100 char bam_aux2A(const uint8_t *s)
1101 {
1102  int type;
1103  type = *s++;
1104  if (type == 'A') return *(char*)s;
1105  else return 0;
1106 }
1107 
1108 char *bam_aux2Z(const uint8_t *s)
1109 {
1110  int type;
1111  type = *s++;
1112  if (type == 'Z' || type == 'H') return (char*)s;
1113  else return 0;
1114 }
1115 
1116 int sam_open_mode(char *mode, const char *fn, const char *format)
1117 {
1118  // TODO Parse "bam5" etc for compression level
1119  if (format == NULL) {
1120  // Try to pick a format based on the filename extension
1121  const char *ext = fn? strrchr(fn, '.') : NULL;
1122  if (ext == NULL || strchr(ext, '/')) return -1;
1123  return sam_open_mode(mode, fn, ext+1);
1124  }
1125  else if (strcmp(format, "bam") == 0) strcpy(mode, "b");
1126  else if (strcmp(format, "cram") == 0) strcpy(mode, "c");
1127  else if (strcmp(format, "sam") == 0) strcpy(mode, "");
1128  else return -1;
1129 
1130  return 0;
1131 }
1132 
1133 #define STRNCMP(a,b,n) (strncasecmp((a),(b),(n)) || strlen(a)!=(n))
1134 int bam_str2flag(const char *str)
1135 {
1136  char *end, *beg = (char*) str;
1137  long int flag = strtol(str, &end, 0);
1138  if ( end!=str ) return flag; // the conversion was successful
1139  flag = 0;
1140  while ( *str )
1141  {
1142  end = beg;
1143  while ( *end && *end!=',' ) end++;
1144  if ( !STRNCMP("PAIRED",beg,end-beg) ) flag |= BAM_FPAIRED;
1145  else if ( !STRNCMP("PROPER_PAIR",beg,end-beg) ) flag |= BAM_FPROPER_PAIR;
1146  else if ( !STRNCMP("UNMAP",beg,end-beg) ) flag |= BAM_FUNMAP;
1147  else if ( !STRNCMP("MUNMAP",beg,end-beg) ) flag |= BAM_FMUNMAP;
1148  else if ( !STRNCMP("REVERSE",beg,end-beg) ) flag |= BAM_FREVERSE;
1149  else if ( !STRNCMP("MREVERSE",beg,end-beg) ) flag |= BAM_FMREVERSE;
1150  else if ( !STRNCMP("READ1",beg,end-beg) ) flag |= BAM_FREAD1;
1151  else if ( !STRNCMP("READ2",beg,end-beg) ) flag |= BAM_FREAD2;
1152  else if ( !STRNCMP("SECONDARY",beg,end-beg) ) flag |= BAM_FSECONDARY;
1153  else if ( !STRNCMP("QCFAIL",beg,end-beg) ) flag |= BAM_FQCFAIL;
1154  else if ( !STRNCMP("DUP",beg,end-beg) ) flag |= BAM_FDUP;
1155  else if ( !STRNCMP("SUPPLEMENTARY",beg,end-beg) ) flag |= BAM_FSUPPLEMENTARY;
1156  else return -1;
1157  if ( !*end ) break;
1158  beg = end + 1;
1159  }
1160  return flag;
1161 }
1162 
1163 char *bam_flag2str(int flag)
1164 {
1165  kstring_t str = {0,0,0};
1166  if ( flag&BAM_FPAIRED ) ksprintf(&str,"%s%s", str.l?",":"","PAIRED");
1167  if ( flag&BAM_FPROPER_PAIR ) ksprintf(&str,"%s%s", str.l?",":"","PROPER_PAIR");
1168  if ( flag&BAM_FUNMAP ) ksprintf(&str,"%s%s", str.l?",":"","UNMAP");
1169  if ( flag&BAM_FMUNMAP ) ksprintf(&str,"%s%s", str.l?",":"","MUNMAP");
1170  if ( flag&BAM_FREVERSE ) ksprintf(&str,"%s%s", str.l?",":"","REVERSE");
1171  if ( flag&BAM_FMREVERSE ) ksprintf(&str,"%s%s", str.l?",":"","MREVERSE");
1172  if ( flag&BAM_FREAD1 ) ksprintf(&str,"%s%s", str.l?",":"","READ1");
1173  if ( flag&BAM_FREAD2 ) ksprintf(&str,"%s%s", str.l?",":"","READ2");
1174  if ( flag&BAM_FSECONDARY ) ksprintf(&str,"%s%s", str.l?",":"","SECONDARY");
1175  if ( flag&BAM_FQCFAIL ) ksprintf(&str,"%s%s", str.l?",":"","QCFAIL");
1176  if ( flag&BAM_FDUP ) ksprintf(&str,"%s%s", str.l?",":"","DUP");
1177  if ( flag&BAM_FSUPPLEMENTARY ) ksprintf(&str,"%s%s", str.l?",":"","SUPPLEMENTARY");
1178  if ( str.l == 0 ) kputsn("", 0, &str);
1179  return str.s;
1180 }
1181 
1182 
1183 /**************************
1184  *** Pileup and Mpileup ***
1185  **************************/
1186 
1187 #if !defined(BAM_NO_PILEUP)
1188 
1189 #include <assert.h>
1190 
1191 /*******************
1192  *** Memory pool ***
1193  *******************/
1194 
1195 typedef struct {
1196  int k, x, y, end;
1197 } cstate_t;
1198 
1199 static cstate_t g_cstate_null = { -1, 0, 0, 0 };
1200 
1201 typedef struct __linkbuf_t {
1206 } lbnode_t;
1207 
1208 typedef struct {
1209  int cnt, n, max;
1211 } mempool_t;
1212 
1213 static mempool_t *mp_init(void)
1214 {
1215  mempool_t *mp;
1216  mp = (mempool_t*)calloc(1, sizeof(mempool_t));
1217  return mp;
1218 }
1219 static void mp_destroy(mempool_t *mp)
1220 {
1221  int k;
1222  for (k = 0; k < mp->n; ++k) {
1223  free(mp->buf[k]->b.data);
1224  free(mp->buf[k]);
1225  }
1226  free(mp->buf);
1227  free(mp);
1228 }
1229 static inline lbnode_t *mp_alloc(mempool_t *mp)
1230 {
1231  ++mp->cnt;
1232  if (mp->n == 0) return (lbnode_t*)calloc(1, sizeof(lbnode_t));
1233  else return mp->buf[--mp->n];
1234 }
1235 static inline void mp_free(mempool_t *mp, lbnode_t *p)
1236 {
1237  --mp->cnt; p->next = 0; // clear lbnode_t::next here
1238  if (mp->n == mp->max) {
1239  mp->max = mp->max? mp->max<<1 : 256;
1240  mp->buf = (lbnode_t**)realloc(mp->buf, sizeof(lbnode_t*) * mp->max);
1241  }
1242  mp->buf[mp->n++] = p;
1243 }
1244 
1245 /**********************
1246  *** CIGAR resolver ***
1247  **********************/
1248 
1249 /* s->k: the index of the CIGAR operator that has just been processed.
1250  s->x: the reference coordinate of the start of s->k
1251  s->y: the query coordiante of the start of s->k
1252  */
1253 static inline int resolve_cigar2(bam_pileup1_t *p, int32_t pos, cstate_t *s)
1254 {
1255 #define _cop(c) ((c)&BAM_CIGAR_MASK)
1256 #define _cln(c) ((c)>>BAM_CIGAR_SHIFT)
1257 
1258  bam1_t *b = p->b;
1259  bam1_core_t *c = &b->core;
1260  uint32_t *cigar = bam_get_cigar(b);
1261  int k;
1262  // determine the current CIGAR operation
1263 // fprintf(stderr, "%s\tpos=%d\tend=%d\t(%d,%d,%d)\n", bam_get_qname(b), pos, s->end, s->k, s->x, s->y);
1264  if (s->k == -1) { // never processed
1265  if (c->n_cigar == 1) { // just one operation, save a loop
1266  if (_cop(cigar[0]) == BAM_CMATCH || _cop(cigar[0]) == BAM_CEQUAL || _cop(cigar[0]) == BAM_CDIFF) s->k = 0, s->x = c->pos, s->y = 0;
1267  } else { // find the first match or deletion
1268  for (k = 0, s->x = c->pos, s->y = 0; k < c->n_cigar; ++k) {
1269  int op = _cop(cigar[k]);
1270  int l = _cln(cigar[k]);
1271  if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CEQUAL || op == BAM_CDIFF) break;
1272  else if (op == BAM_CREF_SKIP) s->x += l;
1273  else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l;
1274  }
1275  assert(k < c->n_cigar);
1276  s->k = k;
1277  }
1278  } else { // the read has been processed before
1279  int op, l = _cln(cigar[s->k]);
1280  if (pos - s->x >= l) { // jump to the next operation
1281  assert(s->k < c->n_cigar); // otherwise a bug: this function should not be called in this case
1282  op = _cop(cigar[s->k+1]);
1283  if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) { // jump to the next without a loop
1284  if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l;
1285  s->x += l;
1286  ++s->k;
1287  } else { // find the next M/D/N/=/X
1288  if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l;
1289  s->x += l;
1290  for (k = s->k + 1; k < c->n_cigar; ++k) {
1291  op = _cop(cigar[k]), l = _cln(cigar[k]);
1292  if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) break;
1293  else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l;
1294  }
1295  s->k = k;
1296  }
1297  assert(s->k < c->n_cigar); // otherwise a bug
1298  } // else, do nothing
1299  }
1300  { // collect pileup information
1301  int op, l;
1302  op = _cop(cigar[s->k]); l = _cln(cigar[s->k]);
1303  p->is_del = p->indel = p->is_refskip = 0;
1304  if (s->x + l - 1 == pos && s->k + 1 < c->n_cigar) { // peek the next operation
1305  int op2 = _cop(cigar[s->k+1]);
1306  int l2 = _cln(cigar[s->k+1]);
1307  if (op2 == BAM_CDEL) p->indel = -(int)l2;
1308  else if (op2 == BAM_CINS) p->indel = l2;
1309  else if (op2 == BAM_CPAD && s->k + 2 < c->n_cigar) { // no working for adjacent padding
1310  int l3 = 0;
1311  for (k = s->k + 2; k < c->n_cigar; ++k) {
1312  op2 = _cop(cigar[k]); l2 = _cln(cigar[k]);
1313  if (op2 == BAM_CINS) l3 += l2;
1314  else if (op2 == BAM_CDEL || op2 == BAM_CMATCH || op2 == BAM_CREF_SKIP || op2 == BAM_CEQUAL || op2 == BAM_CDIFF) break;
1315  }
1316  if (l3 > 0) p->indel = l3;
1317  }
1318  }
1319  if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
1320  p->qpos = s->y + (pos - s->x);
1321  } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) {
1322  p->is_del = 1; p->qpos = s->y; // FIXME: distinguish D and N!!!!!
1323  p->is_refskip = (op == BAM_CREF_SKIP);
1324  } // cannot be other operations; otherwise a bug
1325  p->is_head = (pos == c->pos); p->is_tail = (pos == s->end);
1326  }
1327  return 1;
1328 }
1329 
1330 /***********************
1331  *** Pileup iterator ***
1332  ***********************/
1333 
1334 // Dictionary of overlapping reads
1335 KHASH_MAP_INIT_STR(olap_hash, lbnode_t *)
1336 typedef khash_t(olap_hash) olap_hash_t;
1337 
1338 struct __bam_plp_t {
1339  mempool_t *mp;
1340  lbnode_t *head, *tail, *dummy;
1341  int32_t tid, pos, max_tid, max_pos;
1342  int is_eof, max_plp, error, maxcnt;
1343  uint64_t id;
1344  bam_pileup1_t *plp;
1345  // for the "auto" interface only
1346  bam1_t *b;
1347  bam_plp_auto_f func;
1348  void *data;
1349  olap_hash_t *overlaps;
1350 };
1351 
1353 {
1354  bam_plp_t iter;
1355  iter = (bam_plp_t)calloc(1, sizeof(struct __bam_plp_t));
1356  iter->mp = mp_init();
1357  iter->head = iter->tail = mp_alloc(iter->mp);
1358  iter->dummy = mp_alloc(iter->mp);
1359  iter->max_tid = iter->max_pos = -1;
1360  iter->maxcnt = 8000;
1361  if (func) {
1362  iter->func = func;
1363  iter->data = data;
1364  iter->b = bam_init1();
1365  }
1366  return iter;
1367 }
1368 
1370 {
1371  iter->overlaps = kh_init(olap_hash); // hash for tweaking quality of bases in overlapping reads
1372 }
1373 
1375 {
1376  if ( iter->overlaps ) kh_destroy(olap_hash, iter->overlaps);
1377  mp_free(iter->mp, iter->dummy);
1378  mp_free(iter->mp, iter->head);
1379  if (iter->mp->cnt != 0)
1380  fprintf(stderr, "[bam_plp_destroy] memory leak: %d. Continue anyway.\n", iter->mp->cnt);
1381  mp_destroy(iter->mp);
1382  if (iter->b) bam_destroy1(iter->b);
1383  free(iter->plp);
1384  free(iter);
1385 }
1386 
1387 
1388 //---------------------------------
1389 //--- Tweak overlapping reads
1390 //---------------------------------
1391 
1403 static inline int cigar_iref2iseq_set(uint32_t **cigar, uint32_t *cigar_max, int *icig, int *iseq, int *iref)
1404 {
1405  int pos = *iref;
1406  if ( pos < 0 ) return -1;
1407  *icig = 0;
1408  *iseq = 0;
1409  *iref = 0;
1410  while ( *cigar<cigar_max )
1411  {
1412  int cig = (**cigar) & BAM_CIGAR_MASK;
1413  int ncig = (**cigar) >> BAM_CIGAR_SHIFT;
1414 
1415  if ( cig==BAM_CSOFT_CLIP ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
1416  if ( cig==BAM_CHARD_CLIP ) { (*cigar)++; *icig = 0; continue; }
1417  if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF )
1418  {
1419  pos -= ncig;
1420  if ( pos < 0 ) { *icig = ncig + pos; *iseq += *icig; *iref += *icig; return BAM_CMATCH; }
1421  (*cigar)++; *iseq += ncig; *icig = 0; *iref += ncig;
1422  continue;
1423  }
1424  if ( cig==BAM_CINS ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
1425  if ( cig==BAM_CDEL )
1426  {
1427  pos -= ncig;
1428  if ( pos<0 ) pos = 0;
1429  (*cigar)++; *icig = 0; *iref += ncig;
1430  continue;
1431  }
1432  fprintf(stderr,"todo: cigar %d\n", cig);
1433  assert(0);
1434  }
1435  *iseq = -1;
1436  return -1;
1437 }
1438 static inline int cigar_iref2iseq_next(uint32_t **cigar, uint32_t *cigar_max, int *icig, int *iseq, int *iref)
1439 {
1440  while ( *cigar < cigar_max )
1441  {
1442  int cig = (**cigar) & BAM_CIGAR_MASK;
1443  int ncig = (**cigar) >> BAM_CIGAR_SHIFT;
1444 
1445  if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF )
1446  {
1447  if ( *icig >= ncig - 1 ) { *icig = 0; (*cigar)++; continue; }
1448  (*iseq)++; (*icig)++; (*iref)++;
1449  return BAM_CMATCH;
1450  }
1451  if ( cig==BAM_CDEL ) { (*cigar)++; (*iref) += ncig; *icig = 0; continue; }
1452  if ( cig==BAM_CINS ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
1453  if ( cig==BAM_CSOFT_CLIP ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
1454  if ( cig==BAM_CHARD_CLIP ) { (*cigar)++; *icig = 0; continue; }
1455  fprintf(stderr,"todo: cigar %d\n", cig);
1456  assert(0);
1457  }
1458  *iseq = -1;
1459  *iref = -1;
1460  return -1;
1461 }
1462 
1463 static void tweak_overlap_quality(bam1_t *a, bam1_t *b)
1464 {
1465  uint32_t *a_cigar = bam_get_cigar(a), *a_cigar_max = a_cigar + a->core.n_cigar;
1466  uint32_t *b_cigar = bam_get_cigar(b), *b_cigar_max = b_cigar + b->core.n_cigar;
1467  int a_icig = 0, a_iseq = 0;
1468  int b_icig = 0, b_iseq = 0;
1469  uint8_t *a_qual = bam_get_qual(a), *b_qual = bam_get_qual(b);
1470  uint8_t *a_seq = bam_get_seq(a), *b_seq = bam_get_seq(b);
1471 
1472  int iref = b->core.pos;
1473  int a_iref = iref - a->core.pos;
1474  int b_iref = iref - b->core.pos;
1475  int a_ret = cigar_iref2iseq_set(&a_cigar, a_cigar_max, &a_icig, &a_iseq, &a_iref);
1476  if ( a_ret<0 ) return; // no overlap
1477  int b_ret = cigar_iref2iseq_set(&b_cigar, b_cigar_max, &b_icig, &b_iseq, &b_iref);
1478  if ( b_ret<0 ) return; // no overlap
1479 
1480  #if DBG
1481  fprintf(stderr,"tweak %s n_cigar=%d %d .. %d-%d vs %d-%d\n", bam_get_qname(a), a->core.n_cigar, b->core.n_cigar,
1483  #endif
1484 
1485  while ( 1 )
1486  {
1487  // Increment reference position
1488  while ( a_iref>=0 && a_iref < iref - a->core.pos )
1489  a_ret = cigar_iref2iseq_next(&a_cigar, a_cigar_max, &a_icig, &a_iseq, &a_iref);
1490  if ( a_ret<0 ) break; // done
1491  if ( iref < a_iref + a->core.pos ) iref = a_iref + a->core.pos;
1492 
1493  while ( b_iref>=0 && b_iref < iref - b->core.pos )
1494  b_ret = cigar_iref2iseq_next(&b_cigar, b_cigar_max, &b_icig, &b_iseq, &b_iref);
1495  if ( b_ret<0 ) break; // done
1496  if ( iref < b_iref + b->core.pos ) iref = b_iref + b->core.pos;
1497 
1498  iref++;
1499  if ( a_iref+a->core.pos != b_iref+b->core.pos ) continue; // only CMATCH positions, don't know what to do with indels
1500 
1501  if ( bam_seqi(a_seq,a_iseq) == bam_seqi(b_seq,b_iseq) )
1502  {
1503  #if DBG
1504  fprintf(stderr,"%c",seq_nt16_str[bam_seqi(a_seq,a_iseq)]);
1505  #endif
1506  // we are very confident about this base
1507  int qual = a_qual[a_iseq] + b_qual[b_iseq];
1508  a_qual[a_iseq] = qual>200 ? 200 : qual;
1509  b_qual[b_iseq] = 0;
1510  }
1511  else
1512  {
1513  if ( a_qual[a_iseq] >= b_qual[b_iseq] )
1514  {
1515  #if DBG
1516  fprintf(stderr,"[%c/%c]",seq_nt16_str[bam_seqi(a_seq,a_iseq)],tolower(seq_nt16_str[bam_seqi(b_seq,b_iseq)]));
1517  #endif
1518  a_qual[a_iseq] = 0.8 * a_qual[a_iseq]; // not so confident about a_qual anymore given the mismatch
1519  b_qual[b_iseq] = 0;
1520  }
1521  else
1522  {
1523  #if DBG
1524  fprintf(stderr,"[%c/%c]",tolower(seq_nt16_str[bam_seqi(a_seq,a_iseq)]),seq_nt16_str[bam_seqi(b_seq,b_iseq)]);
1525  #endif
1526  b_qual[b_iseq] = 0.8 * b_qual[b_iseq];
1527  a_qual[a_iseq] = 0;
1528  }
1529  }
1530  }
1531  #if DBG
1532  fprintf(stderr,"\n");
1533  #endif
1534 }
1535 
1536 // Fix overlapping reads. Simple soft-clipping did not give good results.
1537 // Lowering qualities of unwanted bases is more selective and works better.
1538 //
1539 static void overlap_push(bam_plp_t iter, lbnode_t *node)
1540 {
1541  if ( !iter->overlaps ) return;
1542 
1543  // mapped mates and paired reads only
1544  if ( node->b.core.flag&BAM_FMUNMAP || !(node->b.core.flag&BAM_FPROPER_PAIR) ) return;
1545 
1546  // no overlap possible, unless some wild cigar
1547  if ( abs(node->b.core.isize) >= 2*node->b.core.l_qseq ) return;
1548 
1549  khiter_t kitr = kh_get(olap_hash, iter->overlaps, bam_get_qname(&node->b));
1550  if ( kitr==kh_end(iter->overlaps) )
1551  {
1552  int ret;
1553  kitr = kh_put(olap_hash, iter->overlaps, bam_get_qname(&node->b), &ret);
1554  kh_value(iter->overlaps, kitr) = node;
1555  }
1556  else
1557  {
1558  lbnode_t *a = kh_value(iter->overlaps, kitr);
1559  tweak_overlap_quality(&a->b, &node->b);
1560  kh_del(olap_hash, iter->overlaps, kitr);
1561  assert(a->end-1 == a->s.end);
1562  a->end = a->b.core.pos + bam_cigar2rlen(a->b.core.n_cigar, bam_get_cigar(&a->b));
1563  a->s.end = a->end - 1;
1564  }
1565 }
1566 
1567 static void overlap_remove(bam_plp_t iter, const bam1_t *b)
1568 {
1569  if ( !iter->overlaps ) return;
1570 
1571  khiter_t kitr;
1572  if ( b )
1573  {
1574  kitr = kh_get(olap_hash, iter->overlaps, bam_get_qname(b));
1575  if ( kitr!=kh_end(iter->overlaps) )
1576  kh_del(olap_hash, iter->overlaps, kitr);
1577  }
1578  else
1579  {
1580  // remove all
1581  for (kitr = kh_begin(iter->overlaps); kitr<kh_end(iter->overlaps); kitr++)
1582  if ( kh_exist(iter->overlaps, kitr) ) kh_del(olap_hash, iter->overlaps, kitr);
1583  }
1584 }
1585 
1586 
1587 
1588 // Prepares next pileup position in bam records collected by bam_plp_auto -> user func -> bam_plp_push. Returns
1589 // pointer to the piled records if next position is ready or NULL if there is not enough records in the
1590 // buffer yet (the current position is still the maximum position across all buffered reads).
1591 const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
1592 {
1593  if (iter->error) { *_n_plp = -1; return 0; }
1594  *_n_plp = 0;
1595  if (iter->is_eof && iter->head->next == 0) return 0;
1596  while (iter->is_eof || iter->max_tid > iter->tid || (iter->max_tid == iter->tid && iter->max_pos > iter->pos)) {
1597  int n_plp = 0;
1598  lbnode_t *p, *q;
1599  // write iter->plp at iter->pos
1600  iter->dummy->next = iter->head;
1601  for (p = iter->head, q = iter->dummy; p->next; q = p, p = p->next) {
1602  if (p->b.core.tid < iter->tid || (p->b.core.tid == iter->tid && p->end <= iter->pos)) { // then remove
1603  overlap_remove(iter, &p->b);
1604  q->next = p->next; mp_free(iter->mp, p); p = q;
1605  } else if (p->b.core.tid == iter->tid && p->beg <= iter->pos) { // here: p->end > pos; then add to pileup
1606  if (n_plp == iter->max_plp) { // then double the capacity
1607  iter->max_plp = iter->max_plp? iter->max_plp<<1 : 256;
1608  iter->plp = (bam_pileup1_t*)realloc(iter->plp, sizeof(bam_pileup1_t) * iter->max_plp);
1609  }
1610  iter->plp[n_plp].b = &p->b;
1611  if (resolve_cigar2(iter->plp + n_plp, iter->pos, &p->s)) ++n_plp; // actually always true...
1612  }
1613  }
1614  iter->head = iter->dummy->next; // dummy->next may be changed
1615  *_n_plp = n_plp; *_tid = iter->tid; *_pos = iter->pos;
1616  // update iter->tid and iter->pos
1617  if (iter->head->next) {
1618  if (iter->tid > iter->head->b.core.tid) {
1619  fprintf(stderr, "[%s] unsorted input. Pileup aborts.\n", __func__);
1620  iter->error = 1;
1621  *_n_plp = -1;
1622  return 0;
1623  }
1624  }
1625  if (iter->tid < iter->head->b.core.tid) { // come to a new reference sequence
1626  iter->tid = iter->head->b.core.tid; iter->pos = iter->head->beg; // jump to the next reference
1627  } else if (iter->pos < iter->head->beg) { // here: tid == head->b.core.tid
1628  iter->pos = iter->head->beg; // jump to the next position
1629  } else ++iter->pos; // scan contiguously
1630  // return
1631  if (n_plp) return iter->plp;
1632  if (iter->is_eof && iter->head->next == 0) break;
1633  }
1634  return 0;
1635 }
1636 
1637 int bam_plp_push(bam_plp_t iter, const bam1_t *b)
1638 {
1639  if (iter->error) return -1;
1640  if (b) {
1641  if (b->core.tid < 0) { overlap_remove(iter, b); return 0; }
1642  // Skip only unmapped reads here, any additional filtering must be done in iter->func
1643  if (b->core.flag & BAM_FUNMAP) { overlap_remove(iter, b); return 0; }
1644  if (iter->tid == b->core.tid && iter->pos == b->core.pos && iter->mp->cnt > iter->maxcnt)
1645  {
1646  overlap_remove(iter, b);
1647  return 0;
1648  }
1649  bam_copy1(&iter->tail->b, b);
1650  overlap_push(iter, iter->tail);
1651 #ifndef BAM_NO_ID
1652  iter->tail->b.id = iter->id++;
1653 #endif
1654  iter->tail->beg = b->core.pos;
1655  iter->tail->end = b->core.pos + bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
1656  iter->tail->s = g_cstate_null; iter->tail->s.end = iter->tail->end - 1; // initialize cstate_t
1657  if (b->core.tid < iter->max_tid) {
1658  fprintf(stderr, "[bam_pileup_core] the input is not sorted (chromosomes out of order)\n");
1659  iter->error = 1;
1660  return -1;
1661  }
1662  if ((b->core.tid == iter->max_tid) && (iter->tail->beg < iter->max_pos)) {
1663  fprintf(stderr, "[bam_pileup_core] the input is not sorted (reads out of order)\n");
1664  iter->error = 1;
1665  return -1;
1666  }
1667  iter->max_tid = b->core.tid; iter->max_pos = iter->tail->beg;
1668  if (iter->tail->end > iter->pos || iter->tail->b.core.tid > iter->tid) {
1669  iter->tail->next = mp_alloc(iter->mp);
1670  iter->tail = iter->tail->next;
1671  }
1672  } else iter->is_eof = 1;
1673  return 0;
1674 }
1675 
1676 const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
1677 {
1678  const bam_pileup1_t *plp;
1679  if (iter->func == 0 || iter->error) { *_n_plp = -1; return 0; }
1680  if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
1681  else { // no pileup line can be obtained; read alignments
1682  *_n_plp = 0;
1683  if (iter->is_eof) return 0;
1684  int ret;
1685  while ( (ret=iter->func(iter->data, iter->b)) >= 0) {
1686  if (bam_plp_push(iter, iter->b) < 0) {
1687  *_n_plp = -1;
1688  return 0;
1689  }
1690  if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
1691  // otherwise no pileup line can be returned; read the next alignment.
1692  }
1693  if ( ret < -1 ) { iter->error = ret; *_n_plp = -1; return 0; }
1694  bam_plp_push(iter, 0);
1695  if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
1696  return 0;
1697  }
1698 }
1699 
1701 {
1702  lbnode_t *p, *q;
1703  iter->max_tid = iter->max_pos = -1;
1704  iter->tid = iter->pos = 0;
1705  iter->is_eof = 0;
1706  for (p = iter->head; p->next;) {
1707  overlap_remove(iter, NULL);
1708  q = p->next;
1709  mp_free(iter->mp, p);
1710  p = q;
1711  }
1712  iter->head = iter->tail;
1713 }
1714 
1715 void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt)
1716 {
1717  iter->maxcnt = maxcnt;
1718 }
1719 
1720 /************************
1721  *** Mpileup iterator ***
1722  ************************/
1723 
1725  int n;
1728  int *n_plp;
1730 };
1731 
1732 bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data)
1733 {
1734  int i;
1735  bam_mplp_t iter;
1736  iter = (bam_mplp_t)calloc(1, sizeof(struct __bam_mplp_t));
1737  iter->pos = (uint64_t*)calloc(n, 8);
1738  iter->n_plp = (int*)calloc(n, sizeof(int));
1739  iter->plp = (const bam_pileup1_t**)calloc(n, sizeof(bam_pileup1_t*));
1740  iter->iter = (bam_plp_t*)calloc(n, sizeof(bam_plp_t));
1741  iter->n = n;
1742  iter->min = (uint64_t)-1;
1743  for (i = 0; i < n; ++i) {
1744  iter->iter[i] = bam_plp_init(func, data[i]);
1745  iter->pos[i] = iter->min;
1746  }
1747  return iter;
1748 }
1749 
1751 {
1752  int i;
1753  for (i = 0; i < iter->n; ++i)
1754  bam_plp_init_overlaps(iter->iter[i]);
1755 }
1756 
1757 void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt)
1758 {
1759  int i;
1760  for (i = 0; i < iter->n; ++i)
1761  iter->iter[i]->maxcnt = maxcnt;
1762 }
1763 
1765 {
1766  int i;
1767  for (i = 0; i < iter->n; ++i) bam_plp_destroy(iter->iter[i]);
1768  free(iter->iter); free(iter->pos); free(iter->n_plp); free(iter->plp);
1769  free(iter);
1770 }
1771 
1772 int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp)
1773 {
1774  int i, ret = 0;
1775  uint64_t new_min = (uint64_t)-1;
1776  for (i = 0; i < iter->n; ++i) {
1777  if (iter->pos[i] == iter->min) {
1778  int tid, pos;
1779  iter->plp[i] = bam_plp_auto(iter->iter[i], &tid, &pos, &iter->n_plp[i]);
1780  if ( iter->iter[i]->error ) return -1;
1781  iter->pos[i] = iter->plp[i] ? (uint64_t)tid<<32 | pos : 0;
1782  }
1783  if (iter->plp[i] && iter->pos[i] < new_min) new_min = iter->pos[i];
1784  }
1785  iter->min = new_min;
1786  if (new_min == (uint64_t)-1) return 0;
1787  *_tid = new_min>>32; *_pos = (uint32_t)new_min;
1788  for (i = 0; i < iter->n; ++i) {
1789  if (iter->pos[i] == iter->min) { // FIXME: valgrind reports "uninitialised value(s) at this line"
1790  n_plp[i] = iter->n_plp[i], plp[i] = iter->plp[i];
1791  ++ret;
1792  } else n_plp[i] = 0, plp[i] = 0;
1793  }
1794  return ret;
1795 }
1796 
1797 #endif // ~!defined(BAM_NO_PILEUP)