NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
kstring.c
Go to the documentation of this file.
1 #include <stdarg.h>
2 #include <stdio.h>
3 #include <ctype.h>
4 #include <string.h>
5 #include <stdint.h>
6 #include "htslib/kstring.h"
7 
8 int kvsprintf(kstring_t *s, const char *fmt, va_list ap)
9 {
10  va_list args;
11  int l;
12  va_copy(args, ap);
13  l = vsnprintf(s->s + s->l, s->m - s->l, fmt, args); // This line does not work with glibc 2.0. See `man snprintf'.
14  va_end(args);
15  if (l + 1 > s->m - s->l) {
16  s->m = s->l + l + 2;
17  kroundup32(s->m);
18  s->s = (char*)realloc(s->s, s->m);
19  va_copy(args, ap);
20  l = vsnprintf(s->s + s->l, s->m - s->l, fmt, args);
21  va_end(args);
22  }
23  s->l += l;
24  return l;
25 }
26 
27 int ksprintf(kstring_t *s, const char *fmt, ...)
28 {
29  va_list ap;
30  int l;
31  va_start(ap, fmt);
32  l = kvsprintf(s, fmt, ap);
33  va_end(ap);
34  return l;
35 }
36 
37 char *kstrtok(const char *str, const char *sep, ks_tokaux_t *aux)
38 {
39  const char *p, *start;
40  if (sep) { // set up the table
41  if (str == 0 && (aux->tab[0]&1)) return 0; // no need to set up if we have finished
42  aux->finished = 0;
43  if (sep[1]) {
44  aux->sep = -1;
45  aux->tab[0] = aux->tab[1] = aux->tab[2] = aux->tab[3] = 0;
46  for (p = sep; *p; ++p) aux->tab[*p>>6] |= 1ull<<(*p&0x3f);
47  } else aux->sep = sep[0];
48  }
49  if (aux->finished) return 0;
50  else if (str) aux->p = str - 1, aux->finished = 0;
51  if (aux->sep < 0) {
52  for (p = start = aux->p + 1; *p; ++p)
53  if (aux->tab[*p>>6]>>(*p&0x3f)&1) break;
54  } else {
55  for (p = start = aux->p + 1; *p; ++p)
56  if (*p == aux->sep) break;
57  }
58  aux->p = p; // end of token
59  if (*p == 0) aux->finished = 1; // no more tokens
60  return (char*)start;
61 }
62 
63 // s MUST BE a null terminated string; l = strlen(s)
64 int ksplit_core(char *s, int delimiter, int *_max, int **_offsets)
65 {
66  int i, n, max, last_char, last_start, *offsets, l;
67  n = 0; max = *_max; offsets = *_offsets;
68  l = strlen(s);
69 
70 #define __ksplit_aux do { \
71  if (_offsets) { \
72  s[i] = 0; \
73  if (n == max) { \
74  int *tmp; \
75  max = max? max<<1 : 2; \
76  if ((tmp = (int*)realloc(offsets, sizeof(int) * max))) { \
77  offsets = tmp; \
78  } else { \
79  free(offsets); \
80  *_offsets = NULL; \
81  return 0; \
82  } \
83  } \
84  offsets[n++] = last_start; \
85  } else ++n; \
86  } while (0)
87 
88  for (i = 0, last_char = last_start = 0; i <= l; ++i) {
89  if (delimiter == 0) {
90  if (isspace(s[i]) || s[i] == 0) {
91  if (isgraph(last_char)) __ksplit_aux; // the end of a field
92  } else {
93  if (isspace(last_char) || last_char == 0) last_start = i;
94  }
95  } else {
96  if (s[i] == delimiter || s[i] == 0) {
97  if (last_char != 0 && last_char != delimiter) __ksplit_aux; // the end of a field
98  } else {
99  if (last_char == delimiter || last_char == 0) last_start = i;
100  }
101  }
102  last_char = s[i];
103  }
104  *_max = max; *_offsets = offsets;
105  return n;
106 }
107 
108 /**********************
109  * Boyer-Moore search *
110  **********************/
111 
112 typedef unsigned char ubyte_t;
113 
114 // reference: http://www-igm.univ-mlv.fr/~lecroq/string/node14.html
115 static int *ksBM_prep(const ubyte_t *pat, int m)
116 {
117  int i, *suff, *prep, *bmGs, *bmBc;
118  prep = (int*)calloc(m + 256, sizeof(int));
119  bmGs = prep; bmBc = prep + m;
120  { // preBmBc()
121  for (i = 0; i < 256; ++i) bmBc[i] = m;
122  for (i = 0; i < m - 1; ++i) bmBc[pat[i]] = m - i - 1;
123  }
124  suff = (int*)calloc(m, sizeof(int));
125  { // suffixes()
126  int f = 0, g;
127  suff[m - 1] = m;
128  g = m - 1;
129  for (i = m - 2; i >= 0; --i) {
130  if (i > g && suff[i + m - 1 - f] < i - g)
131  suff[i] = suff[i + m - 1 - f];
132  else {
133  if (i < g) g = i;
134  f = i;
135  while (g >= 0 && pat[g] == pat[g + m - 1 - f]) --g;
136  suff[i] = f - g;
137  }
138  }
139  }
140  { // preBmGs()
141  int j = 0;
142  for (i = 0; i < m; ++i) bmGs[i] = m;
143  for (i = m - 1; i >= 0; --i)
144  if (suff[i] == i + 1)
145  for (; j < m - 1 - i; ++j)
146  if (bmGs[j] == m)
147  bmGs[j] = m - 1 - i;
148  for (i = 0; i <= m - 2; ++i)
149  bmGs[m - 1 - suff[i]] = m - 1 - i;
150  }
151  free(suff);
152  return prep;
153 }
154 
155 void *kmemmem(const void *_str, int n, const void *_pat, int m, int **_prep)
156 {
157  int i, j, *prep = 0, *bmGs, *bmBc;
158  const ubyte_t *str, *pat;
159  str = (const ubyte_t*)_str; pat = (const ubyte_t*)_pat;
160  prep = (_prep == 0 || *_prep == 0)? ksBM_prep(pat, m) : *_prep;
161  if (_prep && *_prep == 0) *_prep = prep;
162  bmGs = prep; bmBc = prep + m;
163  j = 0;
164  while (j <= n - m) {
165  for (i = m - 1; i >= 0 && pat[i] == str[i+j]; --i);
166  if (i >= 0) {
167  int max = bmBc[str[i+j]] - m + 1 + i;
168  if (max < bmGs[i]) max = bmGs[i];
169  j += max;
170  } else return (void*)(str + j);
171  }
172  if (_prep == 0) free(prep);
173  return 0;
174 }
175 
176 char *kstrstr(const char *str, const char *pat, int **_prep)
177 {
178  return (char*)kmemmem(str, strlen(str), pat, strlen(pat), _prep);
179 }
180 
181 char *kstrnstr(const char *str, const char *pat, int n, int **_prep)
182 {
183  return (char*)kmemmem(str, n, pat, strlen(pat), _prep);
184 }
185 
186 /***********************
187  * The main() function *
188  ***********************/
189 
190 #ifdef KSTRING_MAIN
191 #include <stdio.h>
192 int main()
193 {
194  kstring_t *s;
195  int *fields, n, i;
196  ks_tokaux_t aux;
197  char *p;
198  s = (kstring_t*)calloc(1, sizeof(kstring_t));
199  // test ksprintf()
200  ksprintf(s, " abcdefg: %d ", 100);
201  printf("'%s'\n", s->s);
202  // test ksplit()
203  fields = ksplit(s, 0, &n);
204  for (i = 0; i < n; ++i)
205  printf("field[%d] = '%s'\n", i, s->s + fields[i]);
206  // test kstrtok()
207  s->l = 0;
208  for (p = kstrtok("ab:cde:fg/hij::k", ":/", &aux); p; p = kstrtok(0, 0, &aux)) {
209  kputsn(p, aux.p - p, s);
210  kputc('\n', s);
211  }
212  printf("%s", s->s);
213  // free
214  free(s->s); free(s); free(fields);
215 
216  {
217  static char *str = "abcdefgcdgcagtcakcdcd";
218  static char *pat = "cd";
219  char *ret, *s = str;
220  int *prep = 0;
221  while ((ret = kstrstr(s, pat, &prep)) != 0) {
222  printf("match: %s\n", ret);
223  s = ret + prep[0];
224  }
225  free(prep);
226  }
227  return 0;
228 }
229 #endif