NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
cram_structs.h
Go to the documentation of this file.
1 /*
2 Copyright (c) 2012-2013 Genome Research Ltd.
3 Author: James Bonfield <jkb@sanger.ac.uk>
4 
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
7 
8  1. Redistributions of source code must retain the above copyright notice,
9 this list of conditions and the following disclaimer.
10 
11  2. Redistributions in binary form must reproduce the above copyright notice,
12 this list of conditions and the following disclaimer in the documentation
13 and/or other materials provided with the distribution.
14 
15  3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16 Institute nor the names of its contributors may be used to endorse or promote
17 products derived from this software without specific prior written permission.
18 
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30 
31 #ifndef _CRAM_STRUCTS_H_
32 #define _CRAM_STRUCTS_H_
33 
34 #ifdef __cplusplus
35 extern "C" {
36 #endif
37 
38 /*
39  * Defines in-memory structs for the basic file-format objects in the
40  * CRAM format.
41  *
42  * The basic file format is:
43  * File-def SAM-hdr Container Container ...
44  *
45  * Container:
46  * Service-block data-block data-block ...
47  *
48  * Multiple blocks in a container are grouped together as slices,
49  * also sometimes referred to as landmarks in the spec.
50  */
51 
52 
53 #include <stdint.h>
54 
55 #include "cram/thread_pool.h"
56 
57 #ifdef SAMTOOLS
58 // From within samtools/HTSlib
59 # include "cram/string_alloc.h"
60 # include "htslib/khash.h"
61 
62 // Generic hash-map integer -> integer
63 KHASH_MAP_INIT_INT(m_i2i, int)
64 
65 // Generic hash-set integer -> (existance)
66 KHASH_SET_INIT_INT(s_i2i)
67 
68 // For brevity
69 typedef unsigned char uc;
70 
71 /*
72  * A union for the preservation map. Required for khash.
73  */
74 typedef union {
75  int i;
76  char *p;
77 } pmap_t;
78 
79 // Generates static functions here which isn't ideal, but we have no way
80 // currently to declare the kh_map_t structure here without also declaring a
81 // duplicate in the .c files due to the nature of the KHASH macros.
82 KHASH_MAP_INIT_STR(map, pmap_t)
83 
84 struct hFILE;
85 typedef struct hFILE cram_FILE;
86 
87 #else
88 // From within io_lib
89 # include "cram/bam.h" // For BAM header parsing
90 typedef FILE cram_FILE;
91 #endif
92 
93 #define SEQS_PER_SLICE 10000
94 #define SLICE_PER_CNT 1
95 
96 #define CRAM_SUBST_MATRIX "CGTNAGTNACTNACGNACGT"
97 
98 #define TN_external
99 //#define NS_external
100 #define TS_external
101 //#define BA_external
102 
103 #define MAX_STAT_VAL 1024
104 //#define MAX_STAT_VAL 16
105 typedef struct {
106  int freqs[MAX_STAT_VAL];
107  khash_t(m_i2i) *h;
108  int nsamp; // total number of values added
109  int nvals; // total number of unique values added
110 } cram_stats;
111 
112 /* NB: matches java impl, not the spec */
114  E_NULL = 0,
116  E_GOLOMB = 2,
120  E_BETA = 6,
121  E_SUBEXP = 7,
124 };
125 
127  E_INT = 1,
128  E_LONG = 2,
129  E_BYTE = 3,
132 };
133 
134 /* "File Definition Structure" */
135 typedef struct {
136  char magic[4];
139  char file_id[20]; // Filename or SHA1 checksum
140 } cram_file_def;
141 
142 #define CRAM_1_VERS 100 // 1.0
143 #define CRAM_2_VERS 200 // 1.1, or 2.0?
144 
145 struct cram_slice;
146 
148  BM_ERROR = -1,
149  RAW = 0,
150  GZIP = 1,
151  BZIP2 = 2,
152 };
153 
155  CT_ERROR = -1,
159  UNMAPPED_SLICE = 3, // CRAM_1_VERS only
160  EXTERNAL = 4,
161  CORE = 5,
162 };
163 
164 /* Compression metrics */
165 typedef struct {
166  int m1;
167  int m2;
168  int trial;
170 } cram_metrics;
171 
172 /* Block */
173 typedef struct {
174  enum cram_block_method method, orig_method;
175  enum cram_content_type content_type;
179  int32_t idx; /* offset into data */
180  unsigned char *data;
181 
182  // For bit I/O
183  size_t alloc;
184  size_t byte;
185  int bit;
186 } cram_block;
187 
188 struct cram_codec; /* defined in cram_codecs.h */
189 struct cram_map;
190 
191 #define CRAM_MAP_HASH 32
192 #define CRAM_MAP(a,b) (((a)*3+(b))&(CRAM_MAP_HASH-1))
193 
194 /* Compression header block */
195 typedef struct {
202 
203  /* Flags from preservation map */
209  int AP_delta;
210  // indexed by ref-base and subst. code
211  char substitution_matrix[5][4];
212 
213  // TD Dictionary as a concatenated block
214  cram_block *TD_blk; // Tag Dictionary
215  int nTL; // number of TL entries in TD
216  unsigned char **TL; // array of size nTL, pointer into TD_blk.
217  khash_t(m_s2i) *TD_hash; // Keyed on TD strings, map to TL[] indices
218  string_alloc_t *TD_keys; // Pooled keys for TD hash.
219 
220  khash_t(map) *preservation_map;
221  struct cram_map *rec_encoding_map[CRAM_MAP_HASH];
222  struct cram_map *tag_encoding_map[CRAM_MAP_HASH];
223 
224  struct cram_codec *BF_codec; // bam bit flags
225  struct cram_codec *CF_codec; // compression flags
226  struct cram_codec *RL_codec; // read length
227  struct cram_codec *AP_codec; // alignment pos
228  struct cram_codec *RG_codec; // read group
229  struct cram_codec *MF_codec; // mate flags
230  struct cram_codec *NS_codec; // next frag ref ID
231  struct cram_codec *NP_codec; // next frag pos
232  struct cram_codec *TS_codec; // template size
233  struct cram_codec *NF_codec; // next frag distance
234  struct cram_codec *TC_codec; // tag count CRAM_1_VERS
235  struct cram_codec *TN_codec; // tag name/type CRAM_1_VERS
236  struct cram_codec *TL_codec; // tag line CRAM_2_VERS
237  struct cram_codec *FN_codec; // no. features
238  struct cram_codec *FC_codec; // feature code
239  struct cram_codec *FP_codec; // feature pos
240  struct cram_codec *BS_codec; // base subst feature
241  struct cram_codec *IN_codec; // insertion feature
242  struct cram_codec *SC_codec; // soft-clip feature
243  struct cram_codec *DL_codec; // deletion len feature
244  struct cram_codec *BA_codec; // base feature
245  struct cram_codec *RS_codec; // ref skip length feature
246  struct cram_codec *PD_codec; // padding length feature
247  struct cram_codec *HC_codec; // hard clip length feature
248  struct cram_codec *MQ_codec; // mapping quality
249  struct cram_codec *RN_codec; // read names
250  struct cram_codec *QS_codec; // quality value (single)
251  struct cram_codec *Qs_codec; // quality values (string)
252  struct cram_codec *RI_codec; // ref ID
253  struct cram_codec *TM_codec; // ?
254  struct cram_codec *TV_codec; // ?
255 
256  char *uncomp; // A single block of uncompressed data
257  size_t uncomp_size, uncomp_alloc;
259 
260 typedef struct cram_map {
261  int key; /* 0xe0 + 3 bytes */
263  int offset; /* Offset into a single block of memory */
264  int size; /* Size */
265  struct cram_codec *codec;
266  struct cram_map *next; // for noddy internal hash
267 } cram_map;
268 
269 /* Mapped or unmapped slice header block */
270 typedef struct {
271  enum cram_content_type content_type;
272  int32_t ref_seq_id; /* if content_type == MAPPED_SLICE */
273  int32_t ref_seq_start; /* if content_type == MAPPED_SLICE */
274  int32_t ref_seq_span; /* if content_type == MAPPED_SLICE */
280  int32_t ref_base_id; /* if content_type == MAPPED_SLICE */
281  unsigned char md5[16];
283 
284 struct ref_entry;
285 
286 /*
287  * Container.
288  *
289  * Conceptually a container is split into slices, and slices into blocks.
290  * However on disk it's just a list of blocks and we need to query the
291  * block types to identify the start/end points of the slices.
292  *
293  * OR... are landmarks the start/end points of slices?
294  */
295 typedef struct {
306 
307  /* Size of container header above */
308  size_t offset;
309 
310  /* Compression header is always the first block? */
313 
314  /* For construction purposes */
315  int max_slice, curr_slice; // maximum number of slices
316  int max_rec, curr_rec; // current and max recs per slice
317  int max_c_rec, curr_c_rec; // current and max recs per container
318  int slice_rec; // rec no. for start of this slice
319  int curr_ref; // current ref ID. -2 for no previous
320  int last_pos; // last record position
321  struct cram_slice **slices, *slice;
322  int pos_sorted; // boolean, 1=>position sorted data
323  int max_apos; // maximum position, used if pos_sorted==0
324  int last_slice; // number of reads in last slice (0 for 1st)
325  int multi_seq; // true if packing multi seqs per cont/slice
326  int unsorted; // true is AP_delta is 0.
327 
328  /* Copied from fd before encoding, to allow multi-threading */
329  int ref_start, first_base, last_base, ref_id, ref_end;
330  char *ref;
331  //struct ref_entry *ref;
332 
333  /* For multi-threading */
335 
336  /* Statistics for encoding */
365 
366  khash_t(s_i2i) *tags_used; // set of tag types in use, for tag encoding map
367  int *refs_used; // array of frequency of ref seq IDs
369 
370 /*
371  * A single cram record
372  */
373 typedef struct {
374  struct cram_slice *s; // Filled out by cram_decode only
375 
376  int32_t ref_id; // fixed for all recs in slice?
377  int32_t flags; // BF
379  int32_t len; // RL
380  int32_t apos; // AP
381  int32_t rg; // RG
382  int32_t name; // RN; idx to s->names_blk
384  int32_t mate_line; // index to another cram_record
387  int32_t tlen; // TS
388 
389  // Auxiliary data
390  int32_t ntags; // TC
391  int32_t aux; // idx to s->aux_blk
392  int32_t aux_size; // total size of packed ntags in aux_blk
393 #ifndef TN_external
394  int32_t TN_idx; // TN; idx to s->TN;
395 #else
396  int32_t tn; // idx to s->tn_blk
397 #endif
398  int TL;
399 
400  int32_t seq; // idx to s->seqs_blk
401  int32_t qual; // idx to s->qual_blk
402  int32_t cigar; // idx to s->cigar
404  int32_t aend; // alignment end
405  int32_t mqual; // MQ
406 
407  int32_t feature; // idx to s->feature
408  int32_t nfeature; // number of features
410 } cram_record;
411 
412 // Accessor macros as an analogue of the bam ones
413 #define cram_qname(c) (&(c)->s->name_blk->data[(c)->name])
414 #define cram_seq(c) (&(c)->s->seqs_blk->data[(c)->seq])
415 #define cram_qual(c) (&(c)->s->qual_blk->data[(c)->qual])
416 #define cram_aux(c) (&(c)->s->aux_blk->data[(c)->aux])
417 #define cram_seqi(c,i) (cram_seq((c))[(i)])
418 #define cram_name_len(c) ((c)->name_len)
419 #define cram_strand(c) (((c)->flags & BAM_FREVERSE) != 0)
420 #define cram_mstrand(c) (((c)->flags & BAM_FMREVERSE) != 0)
421 #define cram_cigar(c) (&((cr)->s->cigar)[(c)->cigar])
422 
423 /*
424  * A feature is a base difference, used for the sequence reference encoding.
425  * (We generate these internally when writing CRAM.)
426  */
427 typedef struct {
428  union {
429  struct {
430  int pos;
431  int code;
432  int base; // substitution code
433  } X;
434  struct {
435  int pos;
436  int code;
437  int base; // actual base & qual
438  int qual;
439  } B;
440  struct {
441  int pos;
442  int code;
443  int qual;
444  } Q;
445  struct {
446  int pos;
447  int code;
448  int len;
449  int seq_idx; // soft-clip multiple bases
450  } S;
451  struct {
452  int pos;
453  int code;
454  int len;
455  int seq_idx; // insertion multiple bases
456  } I;
457  struct {
458  int pos;
459  int code;
460  int base; // insertion single base
461  } i;
462  struct {
463  int pos;
464  int code;
465  int len;
466  } D;
467  struct {
468  int pos;
469  int code;
470  int len;
471  } N;
472  struct {
473  int pos;
474  int code;
475  int len;
476  } P;
477  struct {
478  int pos;
479  int code;
480  int len;
481  } H;
482  };
483 } cram_feature;
484 
485 /*
486  * A slice is really just a set of blocks, but it
487  * is the logical unit for decoding a number of
488  * sequences.
489  */
490 typedef struct cram_slice {
495 
496  /* State used during encoding/decoding */
498 
499  /* Identifier used for auto-assigning read names */
501 
502  /* Array of decoded cram records */
504 
505  /* An dynamically growing buffers for data pointed
506  * to by crecs[] array.
507  */
515  cram_block *base_blk; // substitutions (soft-clips for 1.0)
516  cram_block *soft_blk; // soft-clips
517 
520  int afeatures; // allocated size of features
521 
522 #ifndef TN_external
523  // TN field (Tag Name)
524  uint32_t *TN;
525  int nTN, aTN; // used and allocated size for TN[]
526 #else
528  int tn_id;
529 #endif
530 
531  string_alloc_t *pair_keys; // Pooled keys for pair hash.
532  khash_t(m_s2i) *pair; // for identifying read-pairs in this slice.
533 
534  char *ref; // slice of current reference
535  int ref_start; // start position of current reference;
536  int ref_end; // end position of current reference;
537 
538 #ifdef BA_external
539  int BA_len;
540  int ba_id;
541 #endif
542  int ref_id;
543 } cram_slice;
544 
545 /*-----------------------------------------------------------------------------
546  * Consider moving reference handling to cram_refs.[ch]
547  */
548 // from fa.fai / samtools faidx files
549 typedef struct ref_entry {
550  char *name;
551  char *fn;
556  int64_t count; // for shared references so we know to dealloc seq
557  char *seq;
558 } ref_entry;
559 
561 
562 // References structure.
563 typedef struct {
564  string_alloc_t *pool; // String pool for holding filenames and SN vals
565 
566  khash_t(refs) *h_meta; // ref_entry*, index by name
567  ref_entry **ref_id; // ref_entry*, index by ID
568  int nref; // number of ref_entry
569 
570  char *fn; // current file opened
571  FILE *fp; // and the FILE* to go with it.
572 
573  int count; // how many cram_fd sharing this refs struct
574 
575  pthread_mutex_t lock; // Mutex for multi-threaded updating
576  ref_entry *last; // Last queried sequence
577  int last_id; // Used in cram_ref_decr_locked to delay free
578 } refs_t;
579 
580 /*-----------------------------------------------------------------------------
581  * CRAM index
582  *
583  * Detect format by number of entries per line.
584  * 5 => 1.0 (refid, start, nseq, C offset, slice)
585  * 6 => 1.1 (refid, start, span, C offset, S offset, S size)
586  *
587  * Indices are stored in a nested containment list, which is trivial to set
588  * up as the indices are on sorted data so we're appending to the nclist
589  * in sorted order. Basically if a slice entirely fits within a previous
590  * slice then we append to that slices list. This is done recursively.
591  *
592  * Lists are sorted on two dimensions: ref id + slice coords.
593  */
594 typedef struct cram_index {
595  int nslice, nalloc; // total number of slices
596  struct cram_index *e; // array of size nslice
597 
598  int refid; // 1.0 1.1
599  int start; // 1.0 1.1
600  int end; // 1.1
601  int nseq; // 1.0 - undocumented
602  int slice; // 1.0 landmark index, 1.1 landmark value
603  int len; // 1.1 - size of slice in bytes
604  int64_t offset; // 1.0 1.1
605 } cram_index;
606 
607 typedef struct {
608  int refid;
609  int start;
610  int end;
611 } cram_range;
612 
613 /*-----------------------------------------------------------------------------
614  */
615 /* CRAM File handle */
616 
617 typedef struct spare_bams {
619  struct spare_bams *next;
620 } spare_bams;
621 
622 typedef struct cram_fd {
623  cram_FILE *fp;
624  int mode; // 'r' or 'w'
625  int version;
628 
629  char *prefix;
632  int err;
633 
634  // Most recent compression header decoded
635  //cram_block_compression_hdr *comp_hdr;
636  //cram_block_slice_hdr *slice_hdr;
637 
638  // Current container being processed.
640 
641  // positions for encoding or decoding
643 
644  // cached reference portion
645  refs_t *refs; // ref meta-data structure
646  char *ref, *ref_free; // current portion held in memory
647  int ref_id;
649  int ref_end;
650  char *ref_fn; // reference fasta filename
651 
652  // compression level and metrics
653  int level;
655 
656  // options
657  int decode_md; // Whether to export MD and NM tags
658  int verbose;
662  int no_ref;
664  int use_bz2;
667 
668  // lookup tables, stored here so we can be trivially multi-threaded
669  unsigned int bam_flag_swap[0x1000]; // cram -> bam flags
670  unsigned int cram_flag_swap[0x1000];// bam -> cram flags
671  unsigned char L1[256]; // ACGT{*} ->0123{4}
672  unsigned char L2[256]; // ACGTN{*}->01234{5}
673  char cram_sub_matrix[32][32]; // base substituion codes
674 
675  int index_sz;
676  cram_index *index; // array, sizeof index_sz
678  int eof;
679  int last_slice; // number of recs encoded in last slice
681  int unsorted;
682  int empty_container; // Marker for EOF block
683 
684  // thread pool
685  int own_pool;
688  pthread_mutex_t metrics_lock;
689  pthread_mutex_t ref_lock;
691  pthread_mutex_t bam_list_lock;
692  void *job_pending;
693  int ooc; // out of containers.
694 } cram_fd;
695 
713 };
714 
715 /* BF bitfields */
716 /* Corrected in 1.1. Use bam_flag_swap[bf] and BAM_* macros for 1.0 & 1.1 */
717 #define CRAM_FPAIRED 256
718 #define CRAM_FPROPER_PAIR 128
719 #define CRAM_FUNMAP 64
720 #define CRAM_FREVERSE 32
721 #define CRAM_FREAD1 16
722 #define CRAM_FREAD2 8
723 #define CRAM_FSECONDARY 4
724 #define CRAM_FQCFAIL 2
725 #define CRAM_FDUP 1
726 
727 #define CRAM_M_REVERSE 1
728 #define CRAM_M_UNMAP 2
729 
730 
731 /* CF bitfields */
732 #define CRAM_FLAG_PRESERVE_QUAL_SCORES (1<<0)
733 #define CRAM_FLAG_DETACHED (1<<1)
734 #define CRAM_FLAG_MATE_DOWNSTREAM (1<<2)
735 
736 /* External IDs used by this implementation (only assumed during writing) */
737 #define CRAM_EXT_IN 0
738 #define CRAM_EXT_QUAL 1
739 #define CRAM_EXT_NAME 2
740 #define CRAM_EXT_TS_NP 3
741 #define CRAM_EXT_TAG 4
742 #define CRAM_EXT_TAG_S "\004"
743 #define CRAM_EXT_BA 5
744 #define CRAM_EXT_TN 6
745 #define CRAM_EXT_SC 7
746 #define CRAM_EXT_REF 8
747 
748 #ifdef __cplusplus
749 }
750 #endif
751 
752 #endif /* _CRAM_STRUCTS_H_ */