// Flag bits for the `xtra` option word; each one sits above the low 16 bits.
// In the visible alignment code, KSW_XSUBO and KSW_XSTOP decide whether
// `xtra & 0xffff` is taken as a score threshold (minsc / endsc respectively).
// KSW_XBYTE and KSW_XSTART are not referenced in the visible part of this file.
20 #define KSW_XBYTE 0x10000
21 #define KSW_XSTOP 0x20000
22 #define KSW_XSUBO 0x40000
23 #define KSW_XSTART 0x80000
// Pointers to 128-bit vector storage. The visible initialisation code sets qp
// to a 16-byte-aligned address just past the kswq_t header, H0 to
// qp + slen * m, and Hmax to E + slen.
// NOTE(review): qp presumably holds the query profile and H0/H1/E/Hmax the
// per-row work buffers -- confirm against the full struct definition.
35 __m128i *
qp, *H0, *H1, *E, *Hmax;
// Branch hints: forward the condition to __builtin_expect with the expected
// truth value (1 for LIKELY, 0 for UNLIKELY).
39 #define LIKELY(x) __builtin_expect((x),1)
40 #define UNLIKELY(x) __builtin_expect((x),0)
// Fallback form that expands to the bare expression. The preprocessor
// conditional that selects between the two variants is not visible here.
43 #define UNLIKELY(x) (x)
// Horizontal maximum over the 16 unsigned bytes of xx: each step shifts the
// register right by 8, 4, 2 and then 1 bytes and takes the lane-wise unsigned
// max with the unshifted value, so byte 0 ends up holding the overall maximum.
48 xx = _mm_max_epu8(xx, _mm_srli_si128(xx, 8));
49 xx = _mm_max_epu8(xx, _mm_srli_si128(xx, 4));
50 xx = _mm_max_epu8(xx, _mm_srli_si128(xx, 2));
51 xx = _mm_max_epu8(xx, _mm_srli_si128(xx, 1));
// Extract the low 16-bit word and keep only its low byte (the reduced max).
52 return _mm_extract_epi16((xx), 0) & 0x00ff;
// Fragment of the query-profile initialisation routine (its signature and
// several interior lines are not visible in this view).
// Clamp the requested element size to either 1 or 2.
72 size = size > 1? 2 : 1;
// Number of vectors needed to cover qlen elements at p elements per vector
// (ceiling division).
74 slen = (qlen + p - 1) / p;
// One allocation for the header plus 256 spare bytes plus slen * (m + 4)
// 16-byte vectors.
// NOTE(review): the malloc result is dereferenced below with no visible null
// check -- confirm whether the missing lines handle allocation failure.
75 q = (
kswq_t*)malloc(
sizeof(
kswq_t) + 256 + 16 * slen * (m + 4));
// Round the address just past the header up to the next 16-byte boundary so
// qp can be used with aligned vector loads/stores.
76 q->
qp = (__m128i*)(((
size_t)q +
sizeof(
kswq_t) + 15) >> 4 << 4);
// H0 begins slen * m vectors after qp.
77 q->
H0 = q->
qp + slen * m;
// Hmax begins slen vectors after E.
80 q->
Hmax = q->
E + slen;
// Starts shift at 127 and mdiff at 0, then iterates a over [0, tmp); the loop
// body is not visible here.
84 for (a = 0, q->
shift = 127, q->
mdiff = 0; a < tmp; ++a) {
// Biased profile fill: for each of the m rows of mat, emit one score per slot,
// visiting query positions k = i, i + slen, i + 2*slen, ... for each i.
// Slots with k >= qlen contribute 0; q->shift is added to every value.
95 for (a = 0; a < m; ++a) {
96 int i, k, nlen = slen * p;
97 const int8_t *ma = mat + a * m;
98 for (i = 0; i < slen; ++i)
99 for (k = i; k < nlen; k += slen)
100 *t++ = (k >= qlen? 0 : ma[query[k]]) + q->
shift;
// Same traversal as above, but the scores are written without the shift bias.
// The condition choosing between the two variants is not visible here.
104 for (a = 0; a < m; ++a) {
105 int i, k, nlen = slen * p;
106 const int8_t *ma = mat + a * m;
107 for (i = 0; i < slen; ++i)
108 for (k = i; k < nlen; k += slen)
109 *t++ = (k >= qlen? 0 : ma[query[k]]);
// Constant kswr_t whose first field is 0 and whose remaining six fields are -1.
// NOTE(review): presumably the default/empty alignment result -- confirm
// against the kswr_t field order.
117 const kswr_t g_defr = { 0, -1, -1, -1, -1, -1, -1 };
// Fragment of the unsigned-byte-lane alignment kernel (its signature and a
// number of interior lines are not visible in this view).
119 int slen, i, n_b, te = -1, gmax = 0, minsc, endsc;
121 __m128i zero, gapoe, gape, shift, *H0, *H1, *E, *Hmax;
// Stack-resident work rows: 256/16 = 16 vectors (256 bytes) each.
// NOTE(review): the loops below index these up to slen - 1, so this assumes
// slen <= 16 -- confirm where slen is assigned.
125 __m128i lmem_H0[256/16];
126 __m128i lmem_H1[256/16];
127 __m128i lmem_E[256/16];
128 __m128i lmem_Hmax[256/16];
// minsc / endsc take the low 16 bits of xtra when the matching flag is set;
// otherwise they are 0x10000.
132 minsc = (xtra&
KSW_XSUBO)? xtra&0xffff : 0x10000;
133 endsc = (xtra&
KSW_XSTOP)? xtra&0xffff : 0x10000;
// Broadcast constants: all-zero, gap open+extend, gap extend, and the
// profile bias taken from q->shift.
135 zero = _mm_set1_epi32(0);
136 gapoe = _mm_set1_epi8(_gapo + _gape);
137 gape = _mm_set1_epi8(_gape);
138 shift = _mm_set1_epi8(q->
shift);
// Work on the stack buffers declared above.
140 H0 = lmem_H0; H1 = lmem_H1; E = lmem_E; Hmax = lmem_Hmax;
// Zero the E, H0 and Hmax rows.
142 for (i = 0; i < slen; ++i) {
143 _mm_store_si128(E + i, zero);
144 _mm_store_si128(H0 + i, zero);
145 _mm_store_si128(Hmax + i, zero);
// Local copy of the first slen * 4 vectors of q->qp.
// NOTE(review): the factor 4 and the 4*256/16 capacity are hard-coded --
// presumably the alphabet size m is 4 here; confirm against callers.
149 __m128i lmem_qp[4*256/16];
150 for (
int i = 0; i < slen*4; ++i)
151 lmem_qp[i] = q->
qp[i];
// Outer loop: one iteration per target element.
154 for (i = 0; i < tlen; ++i) {
156 __m128i e, h, f = zero,
max = zero;
// S points at the slen-vector profile row selected by target[i].
158 __m128i* S = lmem_qp + target[i] * slen;
// Seed h from the last vector of H0, shifted left by one byte.
159 h = _mm_load_si128(H0 + slen - 1);
160 h = _mm_slli_si128(h, 1);
// Inner loop over the slen vectors of the row.
162 for (j = 0;
LIKELY(j < slen); ++j) {
// Saturating add of the (biased) profile score, then saturating removal of
// the bias.
170 h = _mm_adds_epu8(h, _mm_load_si128(S + j));
172 h = _mm_subs_epu8(h, shift);
// Take the lane-wise max of h with e and f, fold it into the running max,
// and store the result into H1.
173 e = _mm_load_si128(E + j);
174 h = _mm_max_epu8(h, e);
175 h = _mm_max_epu8(h, f);
176 max = _mm_max_epu8(max, h);
177 _mm_store_si128(H1 + j, h);
// Update E: max of (e - gape) and (h - gapoe), both saturating at zero.
179 h = _mm_subs_epu8(h, gapoe);
180 e = _mm_subs_epu8(e, gape);
181 e = _mm_max_epu8(e, h);
182 _mm_store_si128(E + j, e);
// Update f the same way: max of (f - gape) and (h - gapoe).
184 f = _mm_subs_epu8(f, gape);
185 f = _mm_max_epu8(f, h);
// Load the H0 value that seeds the next j iteration.
187 h = _mm_load_si128(H0 + j);
// Correction passes (at most 16): shift f left one byte, fold it back into
// H1, and compute cmp, a bitmask with one bit per byte lane where the
// saturating difference f - h is zero. How cmp is consumed is not visible.
191 for (k = 0;
LIKELY(k < 16); ++k) {
192 f = _mm_slli_si128(f, 1);
193 for (j = 0;
LIKELY(j < slen); ++j) {
194 h = _mm_load_si128(H1 + j);
195 h = _mm_max_epu8(h, f);
196 _mm_store_si128(H1 + j, h);
197 h = _mm_subs_epu8(h, gapoe);
198 f = _mm_subs_epu8(f, gape);
199 cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_subs_epu8(f, h), zero));
// b entries pack a score in the upper 32 bits and a target index in the
// lower 32. When b is empty or its last index is not i - 1, the first branch
// runs (body not visible); otherwise the last entry is overwritten if its
// stored score is below imax.
208 if (n_b == 0 || (
int32_t)b[n_b-1] + 1 != i) {
210 }
else if ((
int)(b[n_b-1]>>32) < imax) b[n_b-1] = (
uint64_t)imax<<32 | i;
// Copy the H1 row into Hmax (the enclosing condition is not visible here).
214 for (j = 0;
LIKELY(j < slen); ++j)
215 _mm_store_si128(Hmax + j, _mm_load_si128(H1 + j));
// Leave the target loop once gmax + q->shift reaches 255 or gmax reaches endsc.
216 if (gmax + q->
shift >= 255 || gmax >= endsc)
break;
// Swap the H0 and H1 row pointers, using S as the temporary.
218 S = H1; H1 = H0; H0 = S;
// Skipped when r.score is 255. Otherwise scan slen * 16 bytes through t and
// set r.qe to the mapped position (i / 16 + i % 16 * slen) of the largest
// value; on ties the smaller mapped position is kept.
222 if (r.
score != 255) {
223 int max = -1, tmp, low, high, qlen = slen * 16;
225 for (i = 0; i < qlen; ++i, ++t)
226 if ((
int)*t >
max) max = *t, r.
qe = i / 16 + i % 16 * slen;
227 else if ((
int)*t == max && (tmp = i / 16 + i % 16 * slen) < r.
qe) r.
qe = tmp;
// Build a window [low, high] centred on te, then walk the b entries looking
// for one outside that window whose score exceeds r.score2 (the statement
// executed for such entries is not visible here).
231 low = te - i; high = te + i;
232 for (i = 0; i < n_b; ++i) {
234 if ((e < low || e > high) && (int)(b[i]>>32) > r.
score2)
// Fragment of a small diagnostic that prints how the SSE byte shifts move
// data (the enclosing function header is not visible in this view).
// NOTE(review): _mm_store_si128 requires a 16-byte-aligned destination, but
// abcd is a plain unsigned int[4] with no alignment specifier -- confirm the
// alignment or switch to an unaligned store.
244 unsigned int abcd[4];
// b holds the byte values 0..15, with 0 in the lowest byte.
246 __m128i b = _mm_set_epi8( 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 );
// Dump the unshifted register as four 32-bit words.
248 _mm_store_si128( (__m128i*)abcd, b );
249 printf(
"w[0:3]: %X %X %X %X\n", abcd[0], abcd[1], abcd[2], abcd[3]);
// Print the lowest 16-bit word.
251 const int h0 = _mm_extract_epi16( b, 0 );
252 printf(
"h0: %X\n", h0);
// Shift left by one byte and dump.
254 b = _mm_slli_si128( b, 1 );
255 _mm_store_si128( (__m128i*)abcd, b );
256 printf(
"w[0:3]: %X %X %X %X\n", abcd[0], abcd[1], abcd[2], abcd[3]);
// Reset, shift left by four bytes and dump.
258 b = _mm_set_epi8( 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 );
259 b = _mm_slli_si128( b, 4 );
260 _mm_store_si128( (__m128i*)abcd, b );
261 printf(
"w[0:3]: %X %X %X %X\n", abcd[0], abcd[1], abcd[2], abcd[3]);
// Reset, shift right by one byte and dump.
263 b = _mm_set_epi8( 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 );
264 b = _mm_srli_si128( b, 1 );
265 _mm_store_si128( (__m128i*)abcd, b );
266 printf(
"w[0:3]: %X %X %X %X\n", abcd[0], abcd[1], abcd[2], abcd[3]);
// Fragment of a CPU throughput benchmark (the enclosing function header, the
// loop over i and the timer setup are not visible in this view).
// Per-problem storage for N problems: profile pointers, results, zero-filled
// query and target buffers of qlen / tlen bytes each, and an m x m scoring
// matrix with every entry set to -4.
276 std::vector<kswq_t*> qp( N );
277 std::vector<kswr_t> r( N );
278 std::vector<uint8_t> query( qlen * N,
uint8_t(0u) );
279 std::vector<uint8_t> target( tlen * N,
uint8_t(0u) );
280 std::vector<int8_t> mat( m * m, -4 );
// Not referenced in the visible lines.
287 const uint32 block_size = 128;
// Build the profile for problem i with element size 1.
// NOTE(review): no release of the qp[i] allocations is visible in this
// fragment -- confirm they are freed in the missing lines.
295 qp[i] =
ksw_qinit(1, qlen, &query[i*qlen], m, &mat[0] );
// Ten repetitions of the kernel call.
297 for (
uint32 j = 0; j < 10; ++j)
300 r[i] =
ksw_u8( qp[i], tlen, &target[i*tlen], -1, -1, -2 );
// Report alignments per second in millions (10 * N calls over the elapsed
// time) and cell updates per second in billions (that rate times qlen * tlen).
304 fprintf(stderr,
"cpu ksw rate: %.1f M/s (%.1f GCUPS)\n", 1.0e-6f *
float(10 * N)/timer.
seconds(), 1.0e-9f * float(qlen * tlen) * (float(10 * N)/timer.
seconds()));