NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
sam_header.c
Go to the documentation of this file.
1 /*
2 Copyright (c) 2013 Genome Research Ltd.
3 Author: James Bonfield <jkb@sanger.ac.uk>
4 
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
7 
8  1. Redistributions of source code must retain the above copyright notice,
9 this list of conditions and the following disclaimer.
10 
11  2. Redistributions in binary form must reproduce the above copyright notice,
12 this list of conditions and the following disclaimer in the documentation
13 and/or other materials provided with the distribution.
14 
15  3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16 Institute nor the names of its contributors may be used to endorse or promote
17 products derived from this software without specific prior written permission.
18 
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30 
31 #ifdef HAVE_CONFIG_H
32 #include "io_lib_config.h"
33 #endif
34 
35 #include <string.h>
36 #include <assert.h>
37 
38 #include "cram/sam_header.h"
39 #include "cram/string_alloc.h"
40 
41 #ifdef SAMTOOLS
42 #define sam_hdr_parse sam_hdr_parse_
43 #endif
44 
45 static void sam_hdr_error(char *msg, char *line, int len, int lno) {
46  int j;
47 
48  for (j = 0; j < len && line[j] != '\n'; j++)
49  ;
50  fprintf(stderr, "%s at line %d: \"%.*s\"\n", msg, lno, j, line);
51 }
52 
53 void sam_hdr_dump(SAM_hdr *hdr) {
54  khint_t k;
55  int i;
56 
57  printf("===DUMP===\n");
58  for (k = kh_begin(hdr->h); k != kh_end(hdr->h); k++) {
59  SAM_hdr_type *t1, *t2;
60  char c[2];
61 
62  if (!kh_exist(hdr->h, k))
63  continue;
64 
65  t1 = t2 = kh_val(hdr->h, k);
66  c[0] = kh_key(hdr->h, k)>>8;
67  c[1] = kh_key(hdr->h, k)&0xff;
68  printf("Type %.2s, count %d\n", c, t1->prev->order+1);
69 
70  do {
71  SAM_hdr_tag *tag;
72  printf(">>>%d ", t1->order);
73  for (tag = t1->tag; tag; tag=tag->next) {
74  printf("\"%.2s\":\"%.*s\"\t",
75  tag->str, tag->len-3, tag->str+3);
76  }
77  putchar('\n');
78  t1 = t1->next;
79  } while (t1 != t2);
80  }
81 
82  /* Dump out PG chains */
83  printf("\n@PG chains:\n");
84  for (i = 0; i < hdr->npg_end; i++) {
85  int j;
86  printf(" %d:", i);
87  for (j = hdr->pg_end[i]; j != -1; j = hdr->pg[j].prev_id) {
88  printf("%s%d(%.*s)",
89  j == hdr->pg_end[i] ? " " : "->",
90  j, hdr->pg[j].name_len, hdr->pg[j].name);
91  }
92  printf("\n");
93  }
94 
95  puts("===END DUMP===");
96 }
97 
98 /* Updates the hash tables in the SAM_hdr structure.
99  *
100  * Returns 0 on success;
101  * -1 on failure
102  */
103 static int sam_hdr_update_hashes(SAM_hdr *sh,
104  int type,
105  SAM_hdr_type *h_type) {
106  /* Add to reference hash? */
107  if ((type>>8) == 'S' && (type&0xff) == 'Q') {
108  SAM_hdr_tag *tag;
109  int nref = sh->nref;
110 
111  sh->ref = realloc(sh->ref, (sh->nref+1)*sizeof(*sh->ref));
112  if (!sh->ref)
113  return -1;
114 
115  tag = h_type->tag;
116  sh->ref[nref].name = NULL;
117  sh->ref[nref].len = 0;
118  sh->ref[nref].ty = h_type;
119  sh->ref[nref].tag = tag;
120 
121  while (tag) {
122  if (tag->str[0] == 'S' && tag->str[1] == 'N') {
123  if (!(sh->ref[nref].name = malloc(tag->len)))
124  return -1;
125  strncpy(sh->ref[nref].name, tag->str+3, tag->len-3);
126  sh->ref[nref].name[tag->len-3] = 0;
127  } else if (tag->str[0] == 'L' && tag->str[1] == 'N') {
128  sh->ref[nref].len = atoi(tag->str+3);
129  }
130  tag = tag->next;
131  }
132 
133  if (sh->ref[nref].name) {
134  khint_t k;
135  int r;
136  k = kh_put(m_s2i, sh->ref_hash, sh->ref[nref].name, &r);
137  if (-1 == r) return -1;
138  kh_val(sh->ref_hash, k) = nref;
139  }
140 
141  sh->nref++;
142  }
143 
144  /* Add to read-group hash? */
145  if ((type>>8) == 'R' && (type&0xff) == 'G') {
146  SAM_hdr_tag *tag;
147  int nrg = sh->nrg;
148 
149  sh->rg = realloc(sh->rg, (sh->nrg+1)*sizeof(*sh->rg));
150  if (!sh->rg)
151  return -1;
152 
153  tag = h_type->tag;
154  sh->rg[nrg].name = NULL;
155  sh->rg[nrg].name_len = 0;
156  sh->rg[nrg].ty = h_type;
157  sh->rg[nrg].tag = tag;
158  sh->rg[nrg].id = nrg;
159 
160  while (tag) {
161  if (tag->str[0] == 'I' && tag->str[1] == 'D') {
162  if (!(sh->rg[nrg].name = malloc(tag->len)))
163  return -1;
164  strncpy(sh->rg[nrg].name, tag->str+3, tag->len-3);
165  sh->rg[nrg].name[tag->len-3] = 0;
166  sh->rg[nrg].name_len = strlen(sh->rg[nrg].name);
167  }
168  tag = tag->next;
169  }
170 
171  if (sh->rg[nrg].name) {
172  khint_t k;
173  int r;
174  k = kh_put(m_s2i, sh->rg_hash, sh->rg[nrg].name, &r);
175  if (-1 == r) return -1;
176  kh_val(sh->rg_hash, k) = nrg;
177  }
178 
179  sh->nrg++;
180  }
181 
182  /* Add to program hash? */
183  if ((type>>8) == 'P' && (type&0xff) == 'G') {
184  SAM_hdr_tag *tag;
185  int npg = sh->npg;
186 
187  sh->pg = realloc(sh->pg, (sh->npg+1)*sizeof(*sh->pg));
188  if (!sh->pg)
189  return -1;
190 
191  tag = h_type->tag;
192  sh->pg[npg].name = NULL;
193  sh->pg[npg].name_len = 0;
194  sh->pg[npg].ty = h_type;
195  sh->pg[npg].tag = tag;
196  sh->pg[npg].id = npg;
197  sh->pg[npg].prev_id = -1;
198 
199  while (tag) {
200  if (tag->str[0] == 'I' && tag->str[1] == 'D') {
201  if (!(sh->pg[npg].name = malloc(tag->len)))
202  return -1;
203  strncpy(sh->pg[npg].name, tag->str+3, tag->len-3);
204  sh->pg[npg].name[tag->len-3] = 0;
205  sh->pg[npg].name_len = strlen(sh->pg[npg].name);
206  } else if (tag->str[0] == 'P' && tag->str[1] == 'P') {
207  // Resolve later if needed
208  khint_t k;
209  char tmp = tag->str[tag->len]; tag->str[tag->len] = 0;
210  k = kh_get(m_s2i, sh->pg_hash, tag->str+3);
211  tag->str[tag->len] = tmp;
212 
213  if (k != kh_end(sh->pg_hash)) {
214  int p_id = kh_val(sh->pg_hash, k);
215  sh->pg[npg].prev_id = sh->pg[p_id].id;
216 
217  /* Unmark previous entry as a PG termination */
218  if (sh->npg_end > 0 &&
219  sh->pg_end[sh->npg_end-1] == p_id) {
220  sh->npg_end--;
221  } else {
222  int i;
223  for (i = 0; i < sh->npg_end; i++) {
224  if (sh->pg_end[i] == p_id) {
225  memmove(&sh->pg_end[i], &sh->pg_end[i+1],
226  (sh->npg_end-i-1)*sizeof(*sh->pg_end));
227  sh->npg_end--;
228  }
229  }
230  }
231  } else {
232  sh->pg[npg].prev_id = -1;
233  }
234  }
235  tag = tag->next;
236  }
237 
238  if (sh->pg[npg].name) {
239  khint_t k;
240  int r;
241  k = kh_put(m_s2i, sh->pg_hash, sh->pg[npg].name, &r);
242  if (-1 == r) return -1;
243  kh_val(sh->pg_hash, k) = npg;
244  }
245 
246  /* Add to npg_end[] array. Remove later if we find a PP line */
247  if (sh->npg_end >= sh->npg_end_alloc) {
248  sh->npg_end_alloc = sh->npg_end_alloc
249  ? sh->npg_end_alloc*2
250  : 4;
251  sh->pg_end = realloc(sh->pg_end,
252  sh->npg_end_alloc * sizeof(int));
253  if (!sh->pg_end)
254  return -1;
255  }
256  sh->pg_end[sh->npg_end++] = npg;
257 
258  sh->npg++;
259  }
260 
261  return 0;
262 }
263 
264 /*
265  * Appends a formatted line to an existing SAM header.
266  * Line is a full SAM header record, eg "@SQ\tSN:foo\tLN:100", with
267  * optional new-line. If it contains more than 1 line then multiple lines
268  * will be added in order.
269  *
270  * Len is the length of the text data, or 0 if unknown (in which case
271  * it should be null terminated).
272  *
273  * Returns 0 on success
274  * -1 on failure
275  */
276 int sam_hdr_add_lines(SAM_hdr *sh, const char *lines, int len) {
277  int i, lno = 1, text_offset;
278  char *hdr;
279 
280  if (!len)
281  len = strlen(lines);
282 
283  text_offset = ks_len(&sh->text);
284  if (EOF == kputsn(lines, len, &sh->text))
285  return -1;
286  hdr = ks_str(&sh->text) + text_offset;
287 
288  for (i = 0; i < len; i++) {
289  khint32_t type;
290  khint_t k;
291 
292  int l_start = i, new;
293  SAM_hdr_type *h_type;
294  SAM_hdr_tag *h_tag, *last;
295 
296  if (hdr[i] != '@') {
297  int j;
298  for (j = i; j < len && hdr[j] != '\n'; j++)
299  ;
300  sam_hdr_error("Header line does not start with '@'",
301  &hdr[l_start], len - l_start, lno);
302  return -1;
303  }
304 
305  type = (hdr[i+1]<<8) | hdr[i+2];
306  if (hdr[i+1] < 'A' || hdr[i+1] > 'z' ||
307  hdr[i+2] < 'A' || hdr[i+2] > 'z') {
308  sam_hdr_error("Header line does not have a two character key",
309  &hdr[l_start], len - l_start, lno);
310  return -1;
311  }
312 
313  i += 3;
314  if (hdr[i] == '\n')
315  continue;
316 
317  // Add the header line type
318  if (!(h_type = pool_alloc(sh->type_pool)))
319  return -1;
320  if (-1 == (k = kh_put(sam_hdr, sh->h, type, &new)))
321  return -1;
322 
323  // Form the ring, either with self or other lines of this type
324  if (!new) {
325  SAM_hdr_type *t = kh_val(sh->h, k), *p;
326  p = t->prev;
327 
328  assert(p->next = t);
329  p->next = h_type;
330  h_type->prev = p;
331 
332  t->prev = h_type;
333  h_type->next = t;
334  h_type->order = p->order+1;
335  } else {
336  kh_val(sh->h, k) = h_type;
337  h_type->prev = h_type->next = h_type;
338  h_type->order = 0;
339  }
340 
341  // Parse the tags on this line
342  last = NULL;
343  if ((type>>8) == 'C' && (type&0xff) == 'O') {
344  int j;
345  if (hdr[i] != '\t') {
346  sam_hdr_error("Missing tab",
347  &hdr[l_start], len - l_start, lno);
348  return -1;
349  }
350 
351  for (j = ++i; j < len && hdr[j] != '\n'; j++)
352  ;
353 
354  if (!(h_type->tag = h_tag = pool_alloc(sh->tag_pool)))
355  return -1;
356  h_tag->str = string_ndup(sh->str_pool, &hdr[i], j-i);
357  h_tag->len = j-i;
358  h_tag->next = NULL;
359  if (!h_tag->str)
360  return -1;
361 
362  i = j;
363 
364  } else {
365  do {
366  int j;
367  if (hdr[i] != '\t') {
368  sam_hdr_error("Missing tab",
369  &hdr[l_start], len - l_start, lno);
370  return -1;
371  }
372 
373  for (j = ++i; j < len && hdr[j] != '\n' && hdr[j] != '\t'; j++)
374  ;
375 
376  if (!(h_tag = pool_alloc(sh->tag_pool)))
377  return -1;
378  h_tag->str = string_ndup(sh->str_pool, &hdr[i], j-i);
379  h_tag->len = j-i;
380  h_tag->next = NULL;
381  if (!h_tag->str)
382  return -1;
383 
384  if (h_tag->len < 3 || h_tag->str[2] != ':') {
385  sam_hdr_error("Malformed key:value pair",
386  &hdr[l_start], len - l_start, lno);
387  return -1;
388  }
389 
390  if (last)
391  last->next = h_tag;
392  else
393  h_type->tag = h_tag;
394 
395  last = h_tag;
396  i = j;
397  } while (i < len && hdr[i] != '\n');
398  }
399 
400  /* Update RG/SQ hashes */
401  if (-1 == sam_hdr_update_hashes(sh, type, h_type))
402  return -1;
403  }
404 
405  return 0;
406 }
407 
408 /*
409  * Adds a single line to a SAM header.
410  * Specify type and one or more key,value pairs, ending with the NULL key.
411  * Eg. sam_hdr_add(h, "SQ", "ID", "foo", "LN", "100", NULL).
412  *
413  * Returns index for specific entry on success (eg 2nd SQ, 4th RG)
414  * -1 on failure
415  */
416 int sam_hdr_add(SAM_hdr *sh, const char *type, ...) {
417  va_list args;
418  va_start(args, type);
419  return sam_hdr_vadd(sh, type, args, NULL);
420 }
421 
422 int sam_hdr_vadd(SAM_hdr *sh, const char *type, va_list ap, ...) {
423  va_list args;
424  SAM_hdr_type *h_type;
425  SAM_hdr_tag *h_tag, *last;
426  int new;
427  khint32_t type_i = (type[0]<<8) | type[1], k;
428 
429 #if defined(HAVE_VA_COPY)
430  va_list ap_local;
431 #endif
432 
433  if (EOF == kputc_('@', &sh->text))
434  return -1;
435  if (EOF == kputsn(type, 2, &sh->text))
436  return -1;
437 
438  if (!(h_type = pool_alloc(sh->type_pool)))
439  return -1;
440  if (-1 == (k = kh_put(sam_hdr, sh->h, type_i, &new)))
441  return -1;
442  kh_val(sh->h, k) = h_type;
443 
444  // Form the ring, either with self or other lines of this type
445  if (!new) {
446  SAM_hdr_type *t = kh_val(sh->h, k), *p;
447  p = t->prev;
448 
449  assert(p->next = t);
450  p->next = h_type;
451  h_type->prev = p;
452 
453  t->prev = h_type;
454  h_type->next = t;
455  h_type->order = p->order + 1;
456  } else {
457  h_type->prev = h_type->next = h_type;
458  h_type->order = 0;
459  }
460 
461  last = NULL;
462 
463  // Any ... varargs
464  va_start(args, ap);
465  for (;;) {
466  char *k, *v;
467  int idx;
468 
469  if (!(k = (char *)va_arg(args, char *)))
470  break;
471  v = va_arg(args, char *);
472 
473  if (EOF == kputc_('\t', &sh->text))
474  return -1;
475 
476  if (!(h_tag = pool_alloc(sh->tag_pool)))
477  return -1;
478  idx = ks_len(&sh->text);
479 
480  if (EOF == kputs(k, &sh->text))
481  return -1;
482  if (EOF == kputc_(':', &sh->text))
483  return -1;
484  if (EOF == kputs(v, &sh->text))
485  return -1;
486 
487  h_tag->len = ks_len(&sh->text) - idx;
488  h_tag->str = string_ndup(sh->str_pool,
489  ks_str(&sh->text) + idx,
490  h_tag->len);
491  h_tag->next = NULL;
492  if (!h_tag->str)
493  return -1;
494 
495  if (last)
496  last->next = h_tag;
497  else
498  h_type->tag = h_tag;
499 
500  last = h_tag;
501  }
502  va_end(args);
503 
504 #if defined(HAVE_VA_COPY)
505  va_copy(ap_local, ap);
506 # define ap ap_local
507 #endif
508 
509  // Plus the specified va_list params
510  for (;;) {
511  char *k, *v;
512  int idx;
513 
514  if (!(k = (char *)va_arg(ap, char *)))
515  break;
516  v = va_arg(ap, char *);
517 
518  if (EOF == kputc_('\t', &sh->text))
519  return -1;
520 
521  if (!(h_tag = pool_alloc(sh->tag_pool)))
522  return -1;
523  idx = ks_len(&sh->text);
524 
525  if (EOF == kputs(k, &sh->text))
526  return -1;
527  if (EOF == kputc_(':', &sh->text))
528  return -1;
529  if (EOF == kputs(v, &sh->text))
530  return -1;
531 
532  h_tag->len = ks_len(&sh->text) - idx;
533  h_tag->str = string_ndup(sh->str_pool,
534  ks_str(&sh->text) + idx,
535  h_tag->len);
536  h_tag->next = NULL;
537  if (!h_tag->str)
538  return -1;
539 
540  if (last)
541  last->next = h_tag;
542  else
543  h_type->tag = h_tag;
544 
545  last = h_tag;
546  }
547  va_end(ap);
548 
549  if (EOF == kputc('\n', &sh->text))
550  return -1;
551 
552  int itype = (type[0]<<8) | type[1];
553  if (-1 == sam_hdr_update_hashes(sh, itype, h_type))
554  return -1;
555 
556  return h_type->order;
557 }
558 
559 /*
560  * Returns the first header item matching 'type'. If ID is non-NULL it checks
561  * for the tag ID: and compares against the specified ID.
562  *
563  * Returns NULL if no type/ID is found
564  */
566  char *ID_key, char *ID_value) {
567  SAM_hdr_type *t1, *t2;
568  int itype = (type[0]<<8)|(type[1]);
569  khint_t k;
570 
571  /* Special case for types we have prebuilt hashes on */
572  if (ID_key) {
573  if (type[0] == 'S' && type[1] == 'Q' &&
574  ID_key[0] == 'S' && ID_key[1] == 'N') {
575  k = kh_get(m_s2i, hdr->ref_hash, ID_value);
576  return k != kh_end(hdr->ref_hash)
577  ? hdr->ref[kh_val(hdr->ref_hash, k)].ty
578  : NULL;
579  }
580 
581  if (type[0] == 'R' && type[1] == 'G' &&
582  ID_key[0] == 'I' && ID_key[1] == 'D') {
583  k = kh_get(m_s2i, hdr->rg_hash, ID_value);
584  return k != kh_end(hdr->rg_hash)
585  ? hdr->rg[kh_val(hdr->rg_hash, k)].ty
586  : NULL;
587  }
588 
589  if (type[0] == 'P' && type[1] == 'G' &&
590  ID_key[0] == 'I' && ID_key[1] == 'D') {
591  k = kh_get(m_s2i, hdr->pg_hash, ID_value);
592  return k != kh_end(hdr->pg_hash)
593  ? hdr->pg[kh_val(hdr->pg_hash, k)].ty
594  : NULL;
595  }
596  }
597 
598  k = kh_get(sam_hdr, hdr->h, itype);
599  if (k == kh_end(hdr->h))
600  return NULL;
601 
602  if (!ID_key)
603  return kh_val(hdr->h, k);
604 
605  t1 = t2 = kh_val(hdr->h, k);
606  do {
607  SAM_hdr_tag *tag;
608  for (tag = t1->tag; tag; tag = tag->next) {
609  if (tag->str[0] == ID_key[0] && tag->str[1] == ID_key[1]) {
610  char *cp1 = tag->str+3;
611  char *cp2 = ID_value;
612  while (*cp1 && *cp1 == *cp2)
613  cp1++, cp2++;
614  if (*cp2 || *cp1)
615  continue;
616  return t1;
617  }
618  }
619  t1 = t1->next;
620  } while (t1 != t2);
621 
622  return NULL;
623 }
624 
625 /*
626  * As per SAM_hdr_type, but returns a complete line of formatted text
627  * for a specific head type/ID combination. If ID is NULL then it returns
628  * the first line of the specified type.
629  *
630  * The returned string is malloced and should be freed by the calling
631  * function with free().
632  *
633  * Returns NULL if no type/ID is found.
634  */
635 char *sam_hdr_find_line(SAM_hdr *hdr, char *type,
636  char *ID_key, char *ID_value) {
637  SAM_hdr_type *ty = sam_hdr_find(hdr, type, ID_key, ID_value);
639  SAM_hdr_tag *tag;
640  int r = 0;
641 
642  if (!ty)
643  return NULL;
644 
645  // Paste together the line from the hashed copy
646  r |= (kputc_('@', &ks) == EOF);
647  r |= (kputs(type, &ks) == EOF);
648  for (tag = ty->tag; tag; tag = tag->next) {
649  r |= (kputc_('\t', &ks) == EOF);
650  r |= (kputsn(tag->str, tag->len, &ks) == EOF);
651  }
652 
653  if (r) {
654  KS_FREE(&ks);
655  return NULL;
656  }
657 
658  return ks_str(&ks);
659 }
660 
661 
662 /*
663  * Looks for a specific key in a single sam header line.
664  * If prev is non-NULL it also fills this out with the previous tag, to
665  * permit use in key removal. *prev is set to NULL when the tag is the first
666  * key in the list. When a tag isn't found, prev (if non NULL) will be the last
667  * tag in the existing list.
668  *
669  * Returns the tag pointer on success
670  * NULL on failure
671  */
673  SAM_hdr_type *type,
674  char *key,
675  SAM_hdr_tag **prev) {
676  SAM_hdr_tag *tag, *p = NULL;
677 
678  for (tag = type->tag; tag; p = tag, tag = tag->next) {
679  if (tag->str[0] == key[0] && tag->str[1] == key[1]) {
680  if (prev)
681  *prev = p;
682  return tag;
683  }
684  }
685 
686  if (prev)
687  *prev = p;
688 
689  return NULL;
690 }
691 
692 
693 /*
694  * Adds or updates tag key,value pairs in a header line.
695  * Eg for adding M5 tags to @SQ lines or updating sort order for the
696  * @HD line (although use the sam_hdr_sort_order() function for
697  * HD manipulation, which is a wrapper around this funuction).
698  *
699  * Specify multiple key,value pairs ending in NULL.
700  *
701  * Returns 0 on success
702  * -1 on failure
703  */
704 int sam_hdr_update(SAM_hdr *hdr, SAM_hdr_type *type, ...) {
705  va_list ap;
706 
707  va_start(ap, type);
708 
709  for (;;) {
710  char *k, *v;
711  int idx;
712  SAM_hdr_tag *tag, *prev;
713 
714  if (!(k = (char *)va_arg(ap, char *)))
715  break;
716  v = va_arg(ap, char *);
717 
718  tag = sam_hdr_find_key(hdr, type, k, &prev);
719  if (!tag) {
720  if (!(tag = pool_alloc(hdr->tag_pool)))
721  return -1;
722  if (prev)
723  prev->next = tag;
724  else
725  type->tag = tag;
726 
727  tag->next = NULL;
728  }
729 
730  idx = ks_len(&hdr->text);
731  if (ksprintf(&hdr->text, "%2.2s:%s", k, v) < 0)
732  return -1;
733  tag->len = ks_len(&hdr->text) - idx;
734  tag->str = string_ndup(hdr->str_pool,
735  ks_str(&hdr->text) + idx,
736  tag->len);
737  if (!tag->str)
738  return -1;
739  }
740 
741  va_end(ap);
742 
743  return 0;
744 }
745 
746 #define K(a) (((a)[0]<<8)|((a)[1]))
747 
748 /*
749  * Reconstructs the kstring from the header hash table.
750  * Returns 0 on success
751  * -1 on failure
752  */
754  /* Order: HD then others */
756  khint_t k;
757 
758 
759  k = kh_get(sam_hdr, hdr->h, K("HD"));
760  if (k != kh_end(hdr->h)) {
761  SAM_hdr_type *ty = kh_val(hdr->h, k);
762  SAM_hdr_tag *tag;
763  if (EOF == kputs("@HD", &ks))
764  return -1;
765  for (tag = ty->tag; tag; tag = tag->next) {
766  if (EOF == kputc_('\t', &ks))
767  return -1;
768  if (EOF == kputsn_(tag->str, tag->len, &ks))
769  return -1;
770  }
771  if (EOF == kputc('\n', &ks))
772  return -1;
773  }
774 
775  for (k = kh_begin(hdr->h); k != kh_end(hdr->h); k++) {
776  SAM_hdr_type *t1, *t2;
777 
778  if (!kh_exist(hdr->h, k))
779  continue;
780 
781  if (kh_key(hdr->h, k) == K("HD"))
782  continue;
783 
784  t1 = t2 = kh_val(hdr->h, k);
785  do {
786  SAM_hdr_tag *tag;
787  char c[2];
788 
789  if (EOF == kputc_('@', &ks))
790  return -1;
791  c[0] = kh_key(hdr->h, k)>>8;
792  c[1] = kh_key(hdr->h, k)&0xff;
793  if (EOF == kputsn_(c, 2, &ks))
794  return -1;
795  for (tag = t1->tag; tag; tag=tag->next) {
796  if (EOF == kputc_('\t', &ks))
797  return -1;
798  if (EOF == kputsn_(tag->str, tag->len, &ks))
799  return -1;
800  }
801  if (EOF == kputc('\n', &ks))
802  return -1;
803  t1 = t1->next;
804  } while (t1 != t2);
805  }
806 
807  if (ks_str(&hdr->text))
808  KS_FREE(&hdr->text);
809 
810  hdr->text = ks;
811 
812  return 0;
813 }
814 
815 
816 /*
817  * Creates an empty SAM header, ready to be populated.
818  *
819  * Returns a SAM_hdr struct on success (free with sam_hdr_free())
820  * NULL on failure
821  */
823  SAM_hdr *sh = calloc(1, sizeof(*sh));
824 
825  if (!sh)
826  return NULL;
827 
828  sh->h = kh_init(sam_hdr);
829  if (!sh->h)
830  goto err;
831 
832  sh->ID_cnt = 1;
833  sh->ref_count = 1;
834 
835  sh->nref = 0;
836  sh->ref = NULL;
837  if (!(sh->ref_hash = kh_init(m_s2i)))
838  goto err;
839 
840  sh->nrg = 0;
841  sh->rg = NULL;
842  if (!(sh->rg_hash = kh_init(m_s2i)))
843  goto err;
844 
845  sh->npg = 0;
846  sh->pg = NULL;
847  sh->npg_end = sh->npg_end_alloc = 0;
848  sh->pg_end = NULL;
849  if (!(sh->pg_hash = kh_init(m_s2i)))
850  goto err;
851 
852  KS_INIT(&sh->text);
853 
854  if (!(sh->tag_pool = pool_create(sizeof(SAM_hdr_tag))))
855  goto err;
856 
857  if (!(sh->type_pool = pool_create(sizeof(SAM_hdr_type))))
858  goto err;
859 
860  if (!(sh->str_pool = string_pool_create(8192)))
861  goto err;
862 
863  return sh;
864 
865  err:
866  if (sh->h)
867  kh_destroy(sam_hdr, sh->h);
868 
869  if (sh->tag_pool)
870  pool_destroy(sh->tag_pool);
871 
872  if (sh->type_pool)
873  pool_destroy(sh->type_pool);
874 
875  if (sh->str_pool)
877 
878  free(sh);
879 
880  return NULL;
881 }
882 
883 
884 /*
885  * Tokenises a SAM header into a hash table.
886  * Also extracts a few bits on specific data types, such as @RG lines.
887  *
888  * Returns a SAM_hdr struct on success (free with sam_hdr_free())
889  * NULL on failure
890  */
891 SAM_hdr *sam_hdr_parse(const char *hdr, int len) {
892  /* Make an empty SAM_hdr */
893  SAM_hdr *sh;
894 
895  sh = sam_hdr_new();
896  if (NULL == sh) return NULL;
897 
898  if (NULL == hdr) return sh; // empty header is permitted
899 
900  /* Parse the header, line by line */
901  if (-1 == sam_hdr_add_lines(sh, hdr, len)) {
902  sam_hdr_free(sh);
903  return NULL;
904  }
905 
906  //sam_hdr_dump(sh);
907  //sam_hdr_add(sh, "RG", "ID", "foo", "SM", "bar", NULL);
908  //sam_hdr_rebuild(sh);
909  //printf(">>%s<<", ks_str(sh->text));
910 
911  //parse_references(sh);
912  //parse_read_groups(sh);
913 
914  sam_hdr_link_pg(sh);
915  //sam_hdr_dump(sh);
916 
917  return sh;
918 }
919 
920 /*
921  * Produces a duplicate copy of hdr and returns it.
922  * Returns NULL on failure
923  */
925  if (-1 == sam_hdr_rebuild(hdr))
926  return NULL;
927 
928  return sam_hdr_parse(sam_hdr_str(hdr), sam_hdr_length(hdr));
929 }
930 
937  hdr->ref_count++;
938 }
939 
949  sam_hdr_free(hdr);
950 }
951 
960 void sam_hdr_free(SAM_hdr *hdr) {
961  if (!hdr)
962  return;
963 
964  if (--hdr->ref_count > 0)
965  return;
966 
967  if (ks_str(&hdr->text))
968  KS_FREE(&hdr->text);
969 
970  if (hdr->h)
971  kh_destroy(sam_hdr, hdr->h);
972 
973  if (hdr->ref_hash)
974  kh_destroy(m_s2i, hdr->ref_hash);
975 
976  if (hdr->ref) {
977  int i;
978  for (i = 0; i < hdr->nref; i++)
979  if (hdr->ref[i].name)
980  free(hdr->ref[i].name);
981  free(hdr->ref);
982  }
983 
984  if (hdr->rg_hash)
985  kh_destroy(m_s2i, hdr->rg_hash);
986 
987  if (hdr->rg) {
988  int i;
989  for (i = 0; i < hdr->nrg; i++)
990  if (hdr->rg[i].name)
991  free(hdr->rg[i].name);
992  free(hdr->rg);
993  }
994 
995  if (hdr->pg_hash)
996  kh_destroy(m_s2i, hdr->pg_hash);
997 
998  if (hdr->pg) {
999  int i;
1000  for (i = 0; i < hdr->npg; i++)
1001  if (hdr->pg[i].name)
1002  free(hdr->pg[i].name);
1003  free(hdr->pg);
1004  }
1005 
1006  if (hdr->pg_end)
1007  free(hdr->pg_end);
1008 
1009  if (hdr->type_pool)
1010  pool_destroy(hdr->type_pool);
1011 
1012  if (hdr->tag_pool)
1013  pool_destroy(hdr->tag_pool);
1014 
1015  if (hdr->str_pool)
1017 
1018  free(hdr);
1019 }
1020 
1022  return ks_len(&hdr->text);
1023 }
1024 
1025 char *sam_hdr_str(SAM_hdr *hdr) {
1026  return ks_str(&hdr->text);
1027 }
1028 
1029 /*
1030  * Looks up a reference sequence by name and returns the numerical ID.
1031  * Returns -1 if unknown reference.
1032  */
1033 int sam_hdr_name2ref(SAM_hdr *hdr, const char *ref) {
1034  khint_t k = kh_get(m_s2i, hdr->ref_hash, ref);
1035  return k == kh_end(hdr->ref_hash) ? -1 : kh_val(hdr->ref_hash, k);
1036 }
1037 
1038 /*
1039  * Looks up a read-group by name and returns a pointer to the start of the
1040  * associated tag list.
1041  *
1042  * Returns NULL on failure
1043  */
1044 SAM_RG *sam_hdr_find_rg(SAM_hdr *hdr, const char *rg) {
1045  khint_t k = kh_get(m_s2i, hdr->rg_hash, rg);
1046  return k == kh_end(hdr->rg_hash)
1047  ? NULL
1048  : &hdr->rg[kh_val(hdr->rg_hash, k)];
1049 }
1050 
1051 
1052 /*
1053  * Fixes any PP links in @PG headers.
1054  * If the entries are in order then this doesn't need doing, but incase
1055  * our header is out of order this goes through the sh->pg[] array
1056  * setting the prev_id field.
1057  *
1058  * Note we can have multiple complete chains. This code should identify the
1059  * tails of these chains as these are the entries we have to link to in
1060  * subsequent PP records.
1061  *
1062  * Returns 0 on sucess
1063  * -1 on failure (indicating broken PG/PP records)
1064  */
1066  int i, j, ret = 0;
1067 
1068  hdr->npg_end_alloc = hdr->npg;
1069  hdr->pg_end = realloc(hdr->pg_end, hdr->npg * sizeof(*hdr->pg_end));
1070  if (!hdr->pg_end)
1071  return -1;
1072 
1073  for (i = 0; i < hdr->npg; i++)
1074  hdr->pg_end[i] = i;
1075 
1076  for (i = 0; i < hdr->npg; i++) {
1077  khint_t k;
1078  SAM_hdr_tag *tag;
1079  char tmp;
1080 
1081  for (tag = hdr->pg[i].tag; tag; tag = tag->next) {
1082  if (tag->str[0] == 'P' && tag->str[1] == 'P')
1083  break;
1084  }
1085  if (!tag) {
1086  /* Chain start points */
1087  continue;
1088  }
1089 
1090  tmp = tag->str[tag->len]; tag->str[tag->len] = 0;
1091  k = kh_get(m_s2i, hdr->pg_hash, tag->str+3);
1092  tag->str[tag->len] = tmp;
1093 
1094  if (k == kh_end(hdr->pg_hash)) {
1095  ret = -1;
1096  continue;
1097  }
1098 
1099  hdr->pg[i].prev_id = hdr->pg[kh_val(hdr->pg_hash, k)].id;
1100  hdr->pg_end[kh_val(hdr->pg_hash, k)] = -1;
1101  }
1102 
1103  for (i = j = 0; i < hdr->npg; i++) {
1104  if (hdr->pg_end[i] != -1)
1105  hdr->pg_end[j++] = hdr->pg_end[i];
1106  }
1107  hdr->npg_end = j;
1108 
1109  return ret;
1110 }
1111 
1112 /*
1113  * Returns a unique ID from a base name.
1114  *
1115  * The value returned is valid until the next call to
1116  * this function.
1117  */
1118 const char *sam_hdr_PG_ID(SAM_hdr *sh, const char *name) {
1119  khint_t k = kh_get(m_s2i, sh->pg_hash, name);
1120  if (k == kh_end(sh->pg_hash))
1121  return name;
1122 
1123  do {
1124  sprintf(sh->ID_buf, "%.1000s.%d", name, sh->ID_cnt++);
1125  k = kh_get(m_s2i, sh->pg_hash, sh->ID_buf);
1126  } while (k == kh_end(sh->pg_hash));
1127 
1128  return sh->ID_buf;
1129 }
1130 
1131 /*
1132  * Add an @PG line.
1133  *
1134  * If we wish complete control over this use sam_hdr_add() directly. This
1135  * function uses that, but attempts to do a lot of tedious house work for
1136  * you too.
1137  *
1138  * - It will generate a suitable ID if the supplied one clashes.
1139  * - It will generate multiple @PG records if we have multiple PG chains.
1140  *
1141  * Call it as per sam_hdr_add() with a series of key,value pairs ending
1142  * in NULL.
1143  *
1144  * Returns 0 on success
1145  * -1 on failure
1146  */
1147 int sam_hdr_add_PG(SAM_hdr *sh, const char *name, ...) {
1148  va_list args;
1149  va_start(args, name);
1150 
1151  if (sh->npg_end) {
1152  /* Copy ends array to avoid us looping while modifying it */
1153  int *end = malloc(sh->npg_end * sizeof(int));
1154  int i, nends = sh->npg_end;
1155 
1156  if (!end)
1157  return -1;
1158 
1159  memcpy(end, sh->pg_end, nends * sizeof(*end));
1160 
1161  for (i = 0; i < nends; i++) {
1162  if (-1 == sam_hdr_vadd(sh, "PG", args,
1163  "ID", sam_hdr_PG_ID(sh, name),
1164  "PN", name,
1165  "PP", sh->pg[end[i]].name,
1166  NULL)) {
1167  free(end);
1168  return -1;
1169  }
1170  }
1171 
1172  free(end);
1173  } else {
1174  if (-1 == sam_hdr_vadd(sh, "PG", args,
1175  "ID", sam_hdr_PG_ID(sh, name),
1176  "PN", name,
1177  NULL))
1178  return -1;
1179  }
1180 
1181  //sam_hdr_dump(sh);
1182 
1183  return 0;
1184 }
1185 
1186 /*
1187  * A function to help with construction of CL tags in @PG records.
1188  * Takes an argc, argv pair and returns a single space-separated string.
1189  * This string should be deallocated by the calling function.
1190  *
1191  * Returns malloced char * on success
1192  * NULL on failure
1193  */
1194 char *stringify_argv(int argc, char *argv[]) {
1195  char *str, *cp;
1196  size_t nbytes = 1;
1197  int i, j;
1198 
1199  /* Allocate */
1200  for (i = 0; i < argc; i++) {
1201  nbytes += strlen(argv[i]) + 1;
1202  }
1203  if (!(str = malloc(nbytes)))
1204  return NULL;
1205 
1206  /* Copy */
1207  cp = str;
1208  for (i = 0; i < argc; i++) {
1209  j = 0;
1210  while (argv[i][j]) {
1211  if (argv[i][j] == '\t')
1212  *cp++ = ' ';
1213  else
1214  *cp++ = argv[i][j];
1215  j++;
1216  }
1217  *cp++ = ' ';
1218  }
1219  *cp++ = 0;
1220 
1221  return str;
1222 }