NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
cram_stats.c
Go to the documentation of this file.
1 /*
2 Copyright (c) 2012-2013 Genome Research Ltd.
3 Author: James Bonfield <jkb@sanger.ac.uk>
4 
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
7 
8  1. Redistributions of source code must retain the above copyright notice,
9 this list of conditions and the following disclaimer.
10 
11  2. Redistributions in binary form must reproduce the above copyright notice,
12 this list of conditions and the following disclaimer in the documentation
13 and/or other materials provided with the distribution.
14 
15  3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16 Institute nor the names of its contributors may be used to endorse or promote
17 products derived from this software without specific prior written permission.
18 
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30 
31 #ifdef HAVE_CONFIG_H
32 #include "io_lib_config.h"
33 #endif
34 
35 #include <stdio.h>
36 #include <errno.h>
37 #include <assert.h>
38 #include <stdlib.h>
39 #include <string.h>
40 #include <zlib.h>
41 #include <sys/types.h>
42 #include <sys/stat.h>
43 #include <math.h>
44 #include <ctype.h>
45 
46 #include "cram/cram.h"
47 #include "cram/os.h"
48 
50  return calloc(1, sizeof(cram_stats));
51 }
52 
54  st->nsamp++;
55 
56  //assert(val >= 0);
57 
58  if (val < MAX_STAT_VAL && val >= 0) {
59  st->freqs[val]++;
60  } else {
61  khint_t k;
62  int r;
63 
64  if (!st->h) {
65  st->h = kh_init(m_i2i);
66  }
67 
68  k = kh_put(m_i2i, st->h, val, &r);
69  if (r == 0)
70  kh_val(st->h, k)++;
71  else if (r != -1)
72  kh_val(st->h, k) = 1;
73  else
74  ; // FIXME: handle error
75  }
76 }
77 
79  st->nsamp--;
80 
81  //assert(val >= 0);
82 
83  if (val < MAX_STAT_VAL && val >= 0) {
84  st->freqs[val]--;
85  assert(st->freqs[val] >= 0);
86  } else if (st->h) {
87  khint_t k = kh_get(m_i2i, st->h, val);
88 
89  if (k != kh_end(st->h)) {
90  if (--kh_val(st->h, k) == 0)
91  kh_del(m_i2i, st->h, k);
92  } else {
93  fprintf(stderr, "Failed to remove val %d from cram_stats\n", val);
94  st->nsamp++;
95  }
96  } else {
97  fprintf(stderr, "Failed to remove val %d from cram_stats\n", val);
98  st->nsamp++;
99  }
100 }
101 
103  int i;
104  fprintf(stderr, "cram_stats:\n");
105  for (i = 0; i < MAX_STAT_VAL; i++) {
106  if (!st->freqs[i])
107  continue;
108  fprintf(stderr, "\t%d\t%d\n", i, st->freqs[i]);
109  }
110  if (st->h) {
111  khint_t k;
112  for (k = kh_begin(st->h); k != kh_end(st->h); k++) {
113  if (!kh_exist(st->h, k))
114  continue;
115 
116  fprintf(stderr, "\t%d\t%d\n", kh_key(st->h, k), kh_val(st->h, k));
117  }
118  }
119 }
120 
121 #if 1
122 /* Returns the number of bits set in val; it the highest bit used */
123 static int nbits(int v) {
124  static const int MultiplyDeBruijnBitPosition[32] = {
125  1, 10, 2, 11, 14, 22, 3, 30, 12, 15, 17, 19, 23, 26, 4, 31,
126  9, 13, 21, 29, 16, 18, 25, 8, 20, 28, 24, 7, 27, 6, 5, 32
127  };
128 
129  v |= v >> 1; // first up to set all bits 1 after the first 1 */
130  v |= v >> 2;
131  v |= v >> 4;
132  v |= v >> 8;
133  v |= v >> 16;
134 
135  // DeBruijn magic to find top bit
136  return MultiplyDeBruijnBitPosition[(uint32_t)(v * 0x07C4ACDDU) >> 27];
137 }
138 #endif
139 
140 /*
141  * Computes entropy from integer frequencies for various encoding methods and
142  * picks the best encoding.
143  *
144  * FIXME: we could reuse some of the code here for the actual encoding
145  * parameters too. Eg the best 'k' for SUBEXP or the code lengths for huffman.
146  *
147  * Returns the best codec to use.
148  */
150  enum cram_encoding best_encoding = E_NULL;
151  int best_size = INT_MAX, bits;
152  int nvals, i, ntot = 0, max_val = 0, min_val = INT_MAX, k;
153  int *vals = NULL, *freqs = NULL, vals_alloc = 0, *codes;
154 
155  //cram_stats_dump(st);
156 
157  /* Count number of unique symbols */
158  for (nvals = i = 0; i < MAX_STAT_VAL; i++) {
159  if (!st->freqs[i])
160  continue;
161  if (nvals >= vals_alloc) {
162  vals_alloc = vals_alloc ? vals_alloc*2 : 1024;
163  vals = realloc(vals, vals_alloc * sizeof(int));
164  freqs = realloc(freqs, vals_alloc * sizeof(int));
165  if (!vals || !freqs) {
166  if (vals) free(vals);
167  if (freqs) free(freqs);
168  return E_HUFFMAN; // Cannot do much else atm
169  }
170  }
171  vals[nvals] = i;
172  freqs[nvals] = st->freqs[i];
173  ntot += freqs[nvals];
174  if (max_val < i) max_val = i;
175  if (min_val > i) min_val = i;
176  nvals++;
177  }
178  if (st->h) {
179  khint_t k;
180  int i;
181 
182  for (k = kh_begin(st->h); k != kh_end(st->h); k++) {
183  if (!kh_exist(st->h, k))
184  continue;
185 
186  if (nvals >= vals_alloc) {
187  vals_alloc = vals_alloc ? vals_alloc*2 : 1024;
188  vals = realloc(vals, vals_alloc * sizeof(int));
189  freqs = realloc(freqs, vals_alloc * sizeof(int));
190  if (!vals || !freqs)
191  return E_HUFFMAN; // Cannot do much else atm
192  }
193  i = kh_key(st->h, k);
194  vals[nvals]=i;
195  freqs[nvals] = kh_val(st->h, k);
196  ntot += freqs[nvals];
197  if (max_val < i) max_val = i;
198  if (min_val > i) min_val = i;
199  nvals++;
200  }
201  }
202 
203  st->nvals = nvals;
204  assert(ntot == st->nsamp);
205 
206  if (nvals <= 1) {
207  free(vals);
208  free(freqs);
209  return E_HUFFMAN;
210  }
211 
212  /*
213  * Avoid complex stats for now, just do heuristic of HUFFMAN for small
214  * alphabets and BETA for anything large.
215  */
216  free(vals); free(freqs);
217  return nvals < 200 ? E_HUFFMAN : E_BETA;
218 
219  /* We only support huffman now anyway... */
220  //free(vals); free(freqs); return E_HUFFMAN;
221 
222  if (fd->verbose > 1)
223  fprintf(stderr, "Range = %d..%d, nvals=%d, ntot=%d\n",
224  min_val, max_val, nvals, ntot);
225 
226  /* Theoretical entropy */
227  {
228  double dbits = 0;
229  for (i = 0; i < nvals; i++) {
230  dbits += freqs[i] * log((double)freqs[i]/ntot);
231  }
232  dbits /= -log(2);
233  if (fd->verbose > 1)
234  fprintf(stderr, "Entropy = %f\n", dbits);
235  }
236 
237  /* Beta */
238  bits = nbits(max_val - min_val) * ntot;
239  if (fd->verbose > 1)
240  fprintf(stderr, "BETA = %d\n", bits);
241  if (best_size > bits)
242  best_size = bits, best_encoding = E_BETA;
243 
244 #if 0
245  /* Unary */
246  if (min_val >= 0) {
247  for (bits = i = 0; i < nvals; i++)
248  bits += freqs[i]*(vals[i]+1);
249  if (fd->verbose > 1)
250  fprintf(stderr, "UNARY = %d\n", bits);
251  if (best_size > bits)
252  best_size = bits, best_encoding = E_NULL; //E_UNARY;
253  }
254 
255  /* Gamma */
256  for (bits = i = 0; i < nvals; i++)
257  bits += ((nbits(vals[i]-min_val+1)-1) + nbits(vals[i]-min_val+1)) * freqs[i];
258  if (fd->verbose > 1)
259  fprintf(stderr, "GAMMA = %d\n", bits);
260  if (best_size > bits)
261  best_size = bits, best_encoding = E_GAMMA;
262 
263  /* Subexponential */
264  for (k = 0; k < 10; k++) {
265  for (bits = i = 0; i < nvals; i++) {
266  if (vals[i]-min_val < (1<<k))
267  bits += (1 + k)*freqs[i];
268  else
269  bits += (nbits(vals[i]-min_val)*2-k)*freqs[i];
270  }
271 
272  if (fd->verbose > 1)
273  fprintf(stderr, "SUBEXP%d = %d\n", k, bits);
274  if (best_size > bits)
275  best_size = bits, best_encoding = E_SUBEXP;
276  }
277 #endif
278 
279  /* byte array len */
280 
281  /* byte array stop */
282 
283  /* External? Guesswork! */
284 
285  /* Huffman */
286 // qsort(freqs, nvals, sizeof(freqs[0]), sort_freqs);
287 // for (i = 0; i < nvals; i++) {
288 // fprintf(stderr, "%d = %d\n", i, freqs[i]);
289 // vals[i] = 0;
290 // }
291 
292  /* Grow freqs to 2*freqs, to store sums */
293  /* Vals holds link data */
294  freqs = realloc(freqs, 2*nvals*sizeof(*freqs));
295  codes = calloc(2*nvals, sizeof(*codes));
296  if (!freqs || !codes)
297  return E_HUFFMAN; // Cannot do much else atm
298 
299  /* Inefficient, use pointers to form chain so we can insert and maintain
300  * a sorted list? This is currently O(nvals^2) complexity.
301  */
302  for (;;) {
303  int low1 = INT_MAX, low2 = INT_MAX;
304  int ind1 = 0, ind2 = 0;
305  for (i = 0; i < nvals; i++) {
306  if (freqs[i] < 0)
307  continue;
308  if (low1 > freqs[i])
309  low2 = low1, ind2 = ind1, low1 = freqs[i], ind1 = i;
310  else if (low2 > freqs[i])
311  low2 = freqs[i], ind2 = i;
312  }
313  if (low2 == INT_MAX)
314  break;
315 
316  //fprintf(stderr, "Merge ind %d (%d), %d (%d) = %d+%d, => %d=%d\n",
317  // ind1, vals[ind1], ind2, vals[ind2], low1, low2,
318  // nvals, low1+low2);
319 
320  freqs[nvals] = low1 + low2;
321  codes[ind1] = nvals;
322  codes[ind2] = nvals;
323  freqs[ind1] *= -1;
324  freqs[ind2] *= -1;
325  nvals++;
326  }
327  nvals = nvals/2+1;
328 
329  for (i = 0; i < nvals; i++) {
330  int code_len = 0;
331  for (k = codes[i]; k; k = codes[k])
332  code_len++;
333  codes[i] = code_len;
334  freqs[i] *= -1;
335  //fprintf(stderr, "%d / %d => %d\n", vals[i], freqs[i], codes[i]);
336  }
337 
338  for (bits = i = 0; i < nvals; i++) {
339  bits += freqs[i] * codes[i];
340  }
341  if (fd->verbose > 1)
342  fprintf(stderr, "HUFFMAN = %d\n", bits);
343  if (best_size >= bits)
344  best_size = bits, best_encoding = E_HUFFMAN;
345  free(codes);
346 
347  free(vals);
348  free(freqs);
349 
350  return best_encoding;
351 }
352 
354  if (st->h)
355  kh_destroy(m_i2i, st->h);
356  free(st);
357 }