NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
cpu.cpp
Go to the documentation of this file.
1 #include <nvbio/basic/types.h>
2 #include <nvbio/basic/numbers.h>
3 #include <nvbio/basic/timer.h>
4 #include <emmintrin.h>
5 #include <stdio.h>
6 #include <stdlib.h>
7 #include <vector>
8 
9 using namespace nvbio;
10 
11 typedef uint8 uint8_t;
12 typedef uint16 uint16_t;
13 typedef uint32 uint32_t;
14 typedef uint64 uint64_t;
15 typedef int8 int8_t;
16 typedef int16 int16_t;
17 typedef int32 int32_t;
18 typedef int64 int64_t;
19 
20 #define KSW_XBYTE 0x10000
21 #define KSW_XSTOP 0x20000
22 #define KSW_XSUBO 0x40000
23 #define KSW_XSTART 0x80000
24 
25 typedef struct {
26  int score; // best score
27  int te, qe; // target end and query end
28  int score2, te2; // second best score and ending position on the target
29  int tb, qb; // target start and query start
30 } kswr_t;
31 
32 struct kswq_t {
33  int qlen, slen;
34  uint8_t shift, mdiff, max, size;
35  __m128i *qp, *H0, *H1, *E, *Hmax;
36 };
37 
38 #ifdef __GNUC__
39 #define LIKELY(x) __builtin_expect((x),1)
40 #define UNLIKELY(x) __builtin_expect((x),0)
41 #else
42 #define LIKELY(x) (x)
43 #define UNLIKELY(x) (x)
44 #endif
45 
46 __device__ int __max_16(__m128i xx)
47 {
48  xx = _mm_max_epu8(xx, _mm_srli_si128(xx, 8));
49  xx = _mm_max_epu8(xx, _mm_srli_si128(xx, 4));
50  xx = _mm_max_epu8(xx, _mm_srli_si128(xx, 2));
51  xx = _mm_max_epu8(xx, _mm_srli_si128(xx, 1));
52  return _mm_extract_epi16((xx), 0) & 0x00ff;
53 }
54 
67 kswq_t* ksw_qinit(int size, int qlen, const uint8_t *query, int m, const int8_t *mat)
68 {
69  kswq_t *q;
70  int slen, a, tmp, p;
71 
72  size = size > 1? 2 : 1;
73  p = 8 * (3 - size); // # values per __m128i
74  slen = (qlen + p - 1) / p; // segmented length
75  q = (kswq_t*)malloc(sizeof(kswq_t) + 256 + 16 * slen * (m + 4)); // a single block of memory
76  q->qp = (__m128i*)(((size_t)q + sizeof(kswq_t) + 15) >> 4 << 4); // align memory
77  q->H0 = q->qp + slen * m;
78  q->H1 = q->H0 + slen;
79  q->E = q->H1 + slen;
80  q->Hmax = q->E + slen;
81  q->slen = slen; q->qlen = qlen; q->size = size;
82  // compute shift
83  tmp = m * m;
84  for (a = 0, q->shift = 127, q->mdiff = 0; a < tmp; ++a) { // find the minimum and maximum score
85  if (mat[a] < (int8_t)q->shift) q->shift = mat[a];
86  if (mat[a] > (int8_t)q->mdiff) q->mdiff = mat[a];
87  }
88  q->max = q->mdiff;
89  q->shift = 256 - q->shift; // NB: q->shift is uint8_t
90  q->mdiff += q->shift; // this is the difference between the min and max scores
91  // An example: p=8, qlen=19, slen=3 and segmentation:
92  // {{0,3,6,9,12,15,18,-1},{1,4,7,10,13,16,-1,-1},{2,5,8,11,14,17,-1,-1}}
93  if (size == 1) {
94  int8_t *t = (int8_t*)q->qp;
95  for (a = 0; a < m; ++a) {
96  int i, k, nlen = slen * p;
97  const int8_t *ma = mat + a * m;
98  for (i = 0; i < slen; ++i)
99  for (k = i; k < nlen; k += slen) // p iterations
100  *t++ = (k >= qlen? 0 : ma[query[k]]) + q->shift;
101  }
102  } else {
103  int16_t *t = (int16_t*)q->qp;
104  for (a = 0; a < m; ++a) {
105  int i, k, nlen = slen * p;
106  const int8_t *ma = mat + a * m;
107  for (i = 0; i < slen; ++i)
108  for (k = i; k < nlen; k += slen) // p iterations
109  *t++ = (k >= qlen? 0 : ma[query[k]]);
110  }
111  }
112  return q;
113 }
114 
115 kswr_t ksw_u8(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape, int xtra) // the first gap costs -(_o+_e)
116 {
117  const kswr_t g_defr = { 0, -1, -1, -1, -1, -1, -1 };
118 
119  int slen, i, n_b, te = -1, gmax = 0, minsc, endsc;
120  uint64_t b[256];
121  __m128i zero, gapoe, gape, shift, *H0, *H1, *E, *Hmax;
122  kswr_t r;
123 
124  // keep hot arrays in local memory
125  __m128i lmem_H0[256/16];
126  __m128i lmem_H1[256/16];
127  __m128i lmem_E[256/16];
128  __m128i lmem_Hmax[256/16];
129 
130  // initialization
131  r = g_defr;
132  minsc = (xtra&KSW_XSUBO)? xtra&0xffff : 0x10000;
133  endsc = (xtra&KSW_XSTOP)? xtra&0xffff : 0x10000;
134  n_b = 0;
135  zero = _mm_set1_epi32(0);
136  gapoe = _mm_set1_epi8(_gapo + _gape);
137  gape = _mm_set1_epi8(_gape);
138  shift = _mm_set1_epi8(q->shift);
139  //H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax;
140  H0 = lmem_H0; H1 = lmem_H1; E = lmem_E; Hmax = lmem_Hmax;
141  slen = q->slen;
142  for (i = 0; i < slen; ++i) {
143  _mm_store_si128(E + i, zero);
144  _mm_store_si128(H0 + i, zero);
145  _mm_store_si128(Hmax + i, zero);
146  }
147 
148  // copy the query-profile to local memory
149  __m128i lmem_qp[4*256/16];
150  for (int i = 0; i < slen*4; ++i)
151  lmem_qp[i] = q->qp[i];
152 
153  // the core loop
154  for (i = 0; i < tlen; ++i) {
155  int j, k, cmp, imax;
156  __m128i e, h, f = zero, max = zero;
157  //__m128i* S = q->qp + target[i] * slen; // s is the 1st score vector
158  __m128i* S = lmem_qp + target[i] * slen; // s is the 1st score vector
159  h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example
160  h = _mm_slli_si128(h, 1); // h=H(i-1,-1); << instead of >> because x64 is little-endian
161 
162  for (j = 0; LIKELY(j < slen); ++j) {
163  /* SW cells are computed in the following order:
164  * H(i,j) = max{H(i-1,j-1)+S(i,j), E(i,j), F(i,j)}
165  * E(i+1,j) = max{H(i,j)-q, E(i,j)-r}
166  * F(i,j+1) = max{H(i,j)-q, F(i,j)-r}
167  */
168  // compute H'(i,j); note that at the beginning, h=H'(i-1,j-1)
169  //h = _mm_adds_epu8(h, _mm_cached_load_si128(S + j));
170  h = _mm_adds_epu8(h, _mm_load_si128(S + j));
171 
172  h = _mm_subs_epu8(h, shift); // h=H'(i-1,j-1)+S(i,j)
173  e = _mm_load_si128(E + j); // e=E'(i,j)
174  h = _mm_max_epu8(h, e);
175  h = _mm_max_epu8(h, f); // h=H'(i,j)
176  max = _mm_max_epu8(max, h); // set max
177  _mm_store_si128(H1 + j, h); // save to H'(i,j)
178  // now compute E'(i+1,j)
179  h = _mm_subs_epu8(h, gapoe); // h=H'(i,j)-gapo
180  e = _mm_subs_epu8(e, gape); // e=E'(i,j)-gape
181  e = _mm_max_epu8(e, h); // e=E'(i+1,j)
182  _mm_store_si128(E + j, e); // save to E'(i+1,j)
183  // now compute F'(i,j+1)
184  f = _mm_subs_epu8(f, gape);
185  f = _mm_max_epu8(f, h);
186  // get H'(i-1,j) and prepare for the next j
187  h = _mm_load_si128(H0 + j); // h=H'(i-1,j)
188  }
189 
190  // NB: we do not need to set E(i,j) as we disallow adjecent insertion and then deletion
191  for (k = 0; LIKELY(k < 16); ++k) { // this block mimics SWPS3; NB: H(i,j) updated in the lazy-F loop cannot exceed max
192  f = _mm_slli_si128(f, 1);
193  for (j = 0; LIKELY(j < slen); ++j) {
194  h = _mm_load_si128(H1 + j);
195  h = _mm_max_epu8(h, f); // h=H'(i,j)
196  _mm_store_si128(H1 + j, h);
197  h = _mm_subs_epu8(h, gapoe);
198  f = _mm_subs_epu8(f, gape);
199  cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_subs_epu8(f, h), zero));
200  if (UNLIKELY(cmp == 0xffff)) break;
201  }
202  if (UNLIKELY(cmp == 0xffff)) break;
203  }
204 
205  //int k;for (k=0;k<16;++k)printf("%d ", ((uint8_t*)&max)[k]);printf("\n");
206  imax = __max_16(max); // imax is the maximum number in max
207  if (imax >= minsc) { // write the b array; this condition adds branching unfornately
208  if (n_b == 0 || (int32_t)b[n_b-1] + 1 != i) { // then append
209  b[n_b++] = (uint64_t)imax<<32 | i;
210  } else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 | i; // modify the last
211  }
212  if (imax > gmax) {
213  gmax = imax; te = i; // te is the end position on the target
214  for (j = 0; LIKELY(j < slen); ++j) // keep the H1 vector
215  _mm_store_si128(Hmax + j, _mm_load_si128(H1 + j));
216  if (gmax + q->shift >= 255 || gmax >= endsc) break;
217  }
218  S = H1; H1 = H0; H0 = S; // swap H0 and H1
219  }
220  r.score = gmax + q->shift < 255? gmax : 255;
221  r.te = te;
222  if (r.score != 255) { // get a->qe, the end of query match; find the 2nd best score
223  int max = -1, tmp, low, high, qlen = slen * 16;
224  uint8_t *t = (uint8_t*)Hmax;
225  for (i = 0; i < qlen; ++i, ++t)
226  if ((int)*t > max) max = *t, r.qe = i / 16 + i % 16 * slen;
227  else if ((int)*t == max && (tmp = i / 16 + i % 16 * slen) < r.qe) r.qe = tmp;
228  //printf("%d,%d\n", max, gmax);
229  if (b) {
230  i = (r.score + q->max - 1) / q->max;
231  low = te - i; high = te + i;
232  for (i = 0; i < n_b; ++i) {
233  int e = (int32_t)b[i];
234  if ((e < low || e > high) && (int)(b[i]>>32) > r.score2)
235  r.score2 = b[i]>>32, r.te2 = e;
236  }
237  }
238  }
239  return r;
240 }
241 
242 void cpu_test()
243 {
244  unsigned int abcd[4];
245 
246  __m128i b = _mm_set_epi8( 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 );
247 
248  _mm_store_si128( (__m128i*)abcd, b );
249  printf("w[0:3]: %X %X %X %X\n", abcd[0], abcd[1], abcd[2], abcd[3]);
250 
251  const int h0 = _mm_extract_epi16( b, 0 );
252  printf("h0: %X\n", h0);
253 
254  b = _mm_slli_si128( b, 1 );
255  _mm_store_si128( (__m128i*)abcd, b );
256  printf("w[0:3]: %X %X %X %X\n", abcd[0], abcd[1], abcd[2], abcd[3]);
257 
258  b = _mm_set_epi8( 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 );
259  b = _mm_slli_si128( b, 4 );
260  _mm_store_si128( (__m128i*)abcd, b );
261  printf("w[0:3]: %X %X %X %X\n", abcd[0], abcd[1], abcd[2], abcd[3]);
262 
263  b = _mm_set_epi8( 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 );
264  b = _mm_srli_si128( b, 1 );
265  _mm_store_si128( (__m128i*)abcd, b );
266  printf("w[0:3]: %X %X %X %X\n", abcd[0], abcd[1], abcd[2], abcd[3]);
267 }
268 
270 {
271  const uint32_t N = 100*1000;
272  const uint32 qlen = 150;
273  const uint32 tlen = 200;
274  const uint32 m = 4;
275 
276  std::vector<kswq_t*> qp( N );
277  std::vector<kswr_t> r( N );
278  std::vector<uint8_t> query( qlen * N, uint8_t(0u) );
279  std::vector<uint8_t> target( tlen * N, uint8_t(0u) );
280  std::vector<int8_t> mat( m * m, -4 );
281 
282  mat[0 ] = 1;
283  mat[1*m+1] = 1;
284  mat[2*m+2] = 1;
285  mat[3*m+3] = 1;
286 
287  const uint32 block_size = 128;
288  const uint32 n_blocks = util::divide_ri( N, 128 );
289 
290  nvbio::Timer timer;
291  timer.start();
292 
293  {
294  for (uint32_t i = 0; i < N; ++i)
295  qp[i] = ksw_qinit(1, qlen, &query[i*qlen], m, &mat[0] );
296 
297  for (uint32 j = 0; j < 10; ++j)
298  {
299  for (uint32_t i = 0; i < N; ++i)
300  r[i] = ksw_u8( qp[i], tlen, &target[i*tlen], -1, -1, -2 );
301  }
302  }
303 
304  fprintf(stderr, "cpu ksw rate: %.1f M/s (%.1f GCUPS)\n", 1.0e-6f * float(10 * N)/timer.seconds(), 1.0e-9f * float(qlen * tlen) * (float(10 * N)/timer.seconds()));
305 }