38 #include <emmintrin.h>
/* Branch-prediction hints. __builtin_expect is a GCC/Clang extension,
 * so provide a pass-through fallback for other compilers. This also
 * fixes the unconditional double definition of UNLIKELY, which was a
 * macro redefinition with a conflicting body. */
#ifdef __GNUC__
#define LIKELY(x) __builtin_expect((x),1)
#define UNLIKELY(x) __builtin_expect((x),0)
#else
#define LIKELY(x) (x)
#define UNLIKELY(x) (x)
#endif
/* Band-index helper: compute the column offset u of matrix cell (i, j)
 * inside a band of half-width w: u = (j) - max((i)-(w), 0) + 1.
 * Wrapped in do/while(0) so a trailing semicolon is safe in if/else
 * bodies (the original bare-brace form broke `if (c) set_u(...); else`). */
#define set_u(u, w, i, j) do { int x = (i) - (w); x = x > 0 ? x : 0; (u) = (j) - x + 1; } while (0)
/* Band-index helper for the direction matrix: linear index of cell (i, j)
 * within the band, times 3, plus the sub-slot p (0 = E, 1 = F, 2 = H —
 * presumably; confirm against the traceback code). Parenthesizes p,
 * which the original expanded unprotected into `x*3+p`, and uses
 * do/while(0) so the macro is statement-safe in if/else bodies. */
#define set_d(u, w, i, j, p) do { int x = (i) - (w); x = x > 0 ? x : 0; x = (j) - x; (u) = x * 3 + (p); } while (0)
/* Round a 32-bit value x up to the next power of two (klib idiom):
 * decrement, smear the highest set bit into all lower bits with the
 * shift-or cascade, then increment.  x must be a side-effect-free
 * lvalue — it is evaluated (and modified) multiple times.  Left as a
 * comma expression because it may be used in expression context. */
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
94 int32_t segLen = (readLen + 15) / 16;
98 __m128i* vProfile = (__m128i*)malloc(n * segLen *
sizeof(__m128i));
103 for (nt = 0;
LIKELY(nt < n); nt ++) {
104 for (i = 0; i < segLen; i ++) {
106 for (segNum = 0;
LIKELY(segNum < 16) ; segNum ++) {
107 *t++ = j>= readLen ? bias : mat[nt * n + read_num[j]] + bias;
/* Horizontal maximum of the 16 unsigned bytes in vm, stored into m.
 * Each shift-and-max step folds the upper half onto the lower half, so
 * after four steps byte 0 holds the overall maximum.  _mm_extract_epi16
 * reads 16 bits, and byte 1 is an unrelated partial maximum, so mask to
 * the low byte — the original relied on callers truncating the result
 * on assignment to a uint8_t.  Clobbers vm.  do/while(0) makes the
 * multi-statement macro statement-safe. */
#define max16(m, vm) do { \
	(vm) = _mm_max_epu8((vm), _mm_srli_si128((vm), 8)); \
	(vm) = _mm_max_epu8((vm), _mm_srli_si128((vm), 4)); \
	(vm) = _mm_max_epu8((vm), _mm_srli_si128((vm), 2)); \
	(vm) = _mm_max_epu8((vm), _mm_srli_si128((vm), 1)); \
	(m) = _mm_extract_epi16((vm), 0) & 0x00ff; \
} while (0)
143 int32_t end_read = readLen - 1;
145 int32_t segLen = (readLen + 15) / 16;
154 __m128i vZero = _mm_set1_epi32(0);
156 __m128i* pvHStore = (__m128i*) calloc(segLen,
sizeof(__m128i));
157 __m128i* pvHLoad = (__m128i*) calloc(segLen,
sizeof(__m128i));
158 __m128i* pvE = (__m128i*) calloc(segLen,
sizeof(__m128i));
159 __m128i* pvHmax = (__m128i*) calloc(segLen,
sizeof(__m128i));
163 __m128i vGapO = _mm_set1_epi8(weight_gapO);
166 __m128i vGapE = _mm_set1_epi8(weight_gapE);
169 __m128i vBias = _mm_set1_epi8(bias);
171 __m128i vMaxScore = vZero;
172 __m128i vMaxMark = vZero;
185 for (i = begin;
LIKELY(i != end); i += step) {
187 __m128i e = vZero, vF = vZero, vMaxColumn = vZero;
193 __m128i vH = pvHStore[segLen - 1];
194 vH = _mm_slli_si128 (vH, 1);
195 __m128i* vP = vProfile + ref[i] * segLen;
198 __m128i* pv = pvHLoad;
203 for (j = 0;
LIKELY(j < segLen); ++j) {
204 vH = _mm_adds_epu8(vH, _mm_load_si128(vP + j));
205 vH = _mm_subs_epu8(vH, vBias);
213 e = _mm_load_si128(pvE + j);
214 vH = _mm_max_epu8(vH, e);
215 vH = _mm_max_epu8(vH, vF);
216 vMaxColumn = _mm_max_epu8(vMaxColumn, vH);
223 _mm_store_si128(pvHStore + j, vH);
226 vH = _mm_subs_epu8(vH, vGapO);
227 e = _mm_subs_epu8(e, vGapE);
228 e = _mm_max_epu8(e, vH);
229 _mm_store_si128(pvE + j, e);
232 vF = _mm_subs_epu8(vF, vGapE);
233 vF = _mm_max_epu8(vF, vH);
236 vH = _mm_load_si128(pvHLoad + j);
242 vH = _mm_load_si128 (pvHStore + j);
247 vF = _mm_slli_si128 (vF, 1);
248 vTemp = _mm_subs_epu8 (vH, vGapO);
249 vTemp = _mm_subs_epu8 (vF, vTemp);
250 vTemp = _mm_cmpeq_epi8 (vTemp, vZero);
251 cmp = _mm_movemask_epi8 (vTemp);
253 while (cmp != 0xffff)
255 vH = _mm_max_epu8 (vH, vF);
256 vMaxColumn = _mm_max_epu8(vMaxColumn, vH);
257 _mm_store_si128 (pvHStore + j, vH);
258 vF = _mm_subs_epu8 (vF, vGapE);
263 vF = _mm_slli_si128 (vF, 1);
265 vH = _mm_load_si128 (pvHStore + j);
267 vTemp = _mm_subs_epu8 (vH, vGapO);
268 vTemp = _mm_subs_epu8 (vF, vTemp);
269 vTemp = _mm_cmpeq_epi8 (vTemp, vZero);
270 cmp = _mm_movemask_epi8 (vTemp);
273 vMaxScore = _mm_max_epu8(vMaxScore, vMaxColumn);
274 vTemp = _mm_cmpeq_epi8(vMaxMark, vMaxScore);
275 cmp = _mm_movemask_epi8(vTemp);
278 vMaxMark = vMaxScore;
279 max16(temp, vMaxScore);
280 vMaxScore = vMaxMark;
284 if (max + bias >= 255)
break;
288 for (j = 0;
LIKELY(j < segLen); ++j) pvHmax[j] = pvHStore[j];
293 max16(maxColumn[i], vMaxColumn);
295 if (maxColumn[i] == terminate)
break;
300 int32_t column_len = segLen * 16;
301 for (i = 0;
LIKELY(i < column_len); ++i, ++t) {
304 temp = i / 16 + i % 16 * segLen;
305 if (temp < end_read) end_read = temp;
316 bests[0].
score = max + bias >= 255 ? 255 :
max;
317 bests[0].
ref = end_ref;
318 bests[0].
read = end_read;
324 edge = (end_ref - maskLen) > 0 ? (end_ref - maskLen) : 0;
325 for (i = 0; i < edge; i ++) {
327 if (maxColumn[i] > bests[1].score) {
328 bests[1].
score = maxColumn[i];
332 edge = (end_ref + maskLen) > refLen ? refLen : (end_ref + maskLen);
333 for (i = edge + 1; i < refLen; i ++) {
335 if (maxColumn[i] > bests[1].score) {
336 bests[1].
score = maxColumn[i];
342 free(end_read_column);
351 int32_t segLen = (readLen + 7) / 8;
352 __m128i* vProfile = (__m128i*)malloc(n * segLen *
sizeof(__m128i));
358 for (nt = 0;
LIKELY(nt < n); nt ++) {
359 for (i = 0; i < segLen; i ++) {
361 for (segNum = 0;
LIKELY(segNum < 8) ; segNum ++) {
362 *t++ = j>= readLen ? 0 : mat[nt * n + read_num[j]];
/* Horizontal maximum of the 8 signed 16-bit words in vm, stored into m.
 * Three shift-and-max folds leave the overall maximum in word 0.
 * _mm_extract_epi16 zero-extends its result, so cast through int16_t to
 * recover the sign; for the non-negative scores produced by the word
 * kernel this is identical to the uncast form.  Clobbers vm.
 * do/while(0) makes the multi-statement macro statement-safe. */
#define max8(m, vm) do { \
	(vm) = _mm_max_epi16((vm), _mm_srli_si128((vm), 8)); \
	(vm) = _mm_max_epi16((vm), _mm_srli_si128((vm), 4)); \
	(vm) = _mm_max_epi16((vm), _mm_srli_si128((vm), 2)); \
	(m) = (int16_t)_mm_extract_epi16((vm), 0); \
} while (0)
386 int32_t end_read = readLen - 1;
388 int32_t segLen = (readLen + 7) / 8;
397 __m128i vZero = _mm_set1_epi32(0);
399 __m128i* pvHStore = (__m128i*) calloc(segLen,
sizeof(__m128i));
400 __m128i* pvHLoad = (__m128i*) calloc(segLen,
sizeof(__m128i));
401 __m128i* pvE = (__m128i*) calloc(segLen,
sizeof(__m128i));
402 __m128i* pvHmax = (__m128i*) calloc(segLen,
sizeof(__m128i));
406 __m128i vGapO = _mm_set1_epi16(weight_gapO);
409 __m128i vGapE = _mm_set1_epi16(weight_gapE);
412 __m128i vMaxScore = vZero;
413 __m128i vMaxMark = vZero;
423 for (i = begin;
LIKELY(i != end); i += step) {
425 __m128i e = vZero, vF = vZero;
428 __m128i vH = pvHStore[segLen - 1];
429 vH = _mm_slli_si128 (vH, 2);
432 __m128i* pv = pvHLoad;
434 __m128i vMaxColumn = vZero;
436 __m128i* vP = vProfile + ref[i] * segLen;
441 for (j = 0;
LIKELY(j < segLen); j ++) {
442 vH = _mm_adds_epi16(vH, _mm_load_si128(vP + j));
445 e = _mm_load_si128(pvE + j);
446 vH = _mm_max_epi16(vH, e);
447 vH = _mm_max_epi16(vH, vF);
448 vMaxColumn = _mm_max_epi16(vMaxColumn, vH);
451 _mm_store_si128(pvHStore + j, vH);
454 vH = _mm_subs_epu16(vH, vGapO);
455 e = _mm_subs_epu16(e, vGapE);
456 e = _mm_max_epi16(e, vH);
457 _mm_store_si128(pvE + j, e);
460 vF = _mm_subs_epu16(vF, vGapE);
461 vF = _mm_max_epi16(vF, vH);
464 vH = _mm_load_si128(pvHLoad + j);
468 for (k = 0;
LIKELY(k < 8); ++k) {
469 vF = _mm_slli_si128 (vF, 2);
470 for (j = 0;
LIKELY(j < segLen); ++j) {
471 vH = _mm_load_si128(pvHStore + j);
472 vH = _mm_max_epi16(vH, vF);
473 _mm_store_si128(pvHStore + j, vH);
474 vH = _mm_subs_epu16(vH, vGapO);
475 vF = _mm_subs_epu16(vF, vGapE);
476 if (
UNLIKELY(! _mm_movemask_epi8(_mm_cmpgt_epi16(vF, vH))))
goto end;
481 vMaxScore = _mm_max_epi16(vMaxScore, vMaxColumn);
482 vTemp = _mm_cmpeq_epi16(vMaxMark, vMaxScore);
483 cmp = _mm_movemask_epi8(vTemp);
486 vMaxMark = vMaxScore;
487 max8(temp, vMaxScore);
488 vMaxScore = vMaxMark;
493 for (j = 0;
LIKELY(j < segLen); ++j) pvHmax[j] = pvHStore[j];
498 max8(maxColumn[i], vMaxColumn);
499 if (maxColumn[i] == terminate)
break;
504 int32_t column_len = segLen * 8;
505 for (i = 0;
LIKELY(i < column_len); ++i, ++t) {
508 temp = i / 8 + i % 8 * segLen;
509 if (temp < end_read) end_read = temp;
521 bests[0].
ref = end_ref;
522 bests[0].
read = end_read;
528 edge = (end_ref - maskLen) > 0 ? (end_ref - maskLen) : 0;
529 for (i = 0; i < edge; i ++) {
530 if (maxColumn[i] > bests[1].score) {
531 bests[1].
score = maxColumn[i];
535 edge = (end_ref + maskLen) > refLen ? refLen : (end_ref + maskLen);
536 for (i = edge; i < refLen; i ++) {
537 if (maxColumn[i] > bests[1].score) {
538 bests[1].
score = maxColumn[i];
544 free(end_read_column);
560 int32_t i, j, e, f, temp1, temp2, s = 16, s1 = 8, s2 = 1024, l,
max = 0;
562 int8_t *direction, *direction_line;
570 width = band_width * 2 + 3, width_d = band_width * 2 + 1;
571 while (width >= s1) {
578 while (width_d * readLen * 3 >= s2) {
582 fprintf(stderr,
"Alignment score and position are not consensus.\n");
585 direction = (
int8_t*)realloc(direction, s2 *
sizeof(
int8_t));
587 direction_line = direction;
588 for (j = 1;
LIKELY(j < width - 1); j ++) h_b[j] = 0;
589 for (i = 0;
LIKELY(i < readLen); i ++) {
590 int32_t beg = 0, end = refLen - 1, u = 0, edge;
591 j = i - band_width; beg = beg > j ? beg : j;
592 j = i + band_width; end = end < j ? end : j;
593 edge = end + 1 < width - 1 ? end + 1 : width - 1;
594 f = h_b[0] = e_b[0] = h_b[edge] = e_b[edge] = h_c[0] = 0;
595 direction_line = direction + width_d * i * 3;
597 for (j = beg;
LIKELY(j <= end); j ++) {
598 int32_t b, e1, f1, d, de, df, dh;
599 set_u(u, band_width, i, j);
set_u(e, band_width, i - 1, j);
600 set_u(b, band_width, i, j - 1);
set_u(d, band_width, i - 1, j - 1);
601 set_d(de, band_width, i, j, 0);
602 set_d(df, band_width, i, j, 1);
603 set_d(dh, band_width, i, j, 2);
605 temp1 = i == 0 ? -weight_gapO : h_b[e] - weight_gapO;
606 temp2 = i == 0 ? -weight_gapE : e_b[e] - weight_gapE;
607 e_b[u] = temp1 > temp2 ? temp1 : temp2;
608 direction_line[de] = temp1 > temp2 ? 3 : 2;
610 temp1 = h_c[b] - weight_gapO;
611 temp2 = f - weight_gapE;
612 f = temp1 > temp2 ? temp1 : temp2;
613 direction_line[df] = temp1 > temp2 ? 5 : 4;
615 e1 = e_b[u] > 0 ? e_b[u] : 0;
617 temp1 = e1 > f1 ? e1 : f1;
618 temp2 = h_b[d] + mat[ref[j] * n + read[i]];
619 h_c[u] = temp1 > temp2 ? temp1 : temp2;
621 if (h_c[u] > max) max = h_c[u];
623 if (temp1 <= temp2) direction_line[dh] = 1;
624 else direction_line[dh] = e1 > f1 ? direction_line[de] : direction_line[df];
626 for (j = 1; j <= u; j ++) h_b[j] = h_c[j];
629 }
while (
LIKELY(max < score));
640 set_d(temp1, band_width, i, j, temp2);
641 switch (direction_line[temp1]) {
646 direction_line -= width_d * 3;
652 direction_line -= width_d * 3;
658 direction_line -= width_d * 3;
672 fprintf(stderr,
"Trace back error: %d.\n", direction_line[temp1 - 1]);
732 while (
LIKELY(start <= end)) {
733 reverse[start] = seq[end];
734 reverse[end] = seq[start];
747 if (score_size == 0 || score_size == 2) {
750 for (i = 0; i < n*n; i++)
if (mat[i] < bias) bias = mat[i];
791 fprintf(stderr,
"When maskLen < 15, the function ssw_align doesn't return 2nd best alignment information.\n");
801 }
else if (bests[0].score == 255) {
802 fprintf(stderr,
"Please set 2 to the score_size parameter of the function ssw_init, otherwise the alignment results will be incorrect.\n");
809 fprintf(stderr,
"Please call the function ssw_init before ssw_align.\n");
823 if (flag == 0 || (flag == 2 && r->
score1 < filters))
goto end;
844 band_width = abs(refLen - readLen) + 1;
846 if (path == 0) r = 0;