NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
ssw.cpp
Go to the documentation of this file.
1 /* The MIT License
2 
3  Copyright (c) 2012-1015 Boston College.
4 
5  Permission is hereby granted, free of charge, to any person obtaining
6  a copy of this software and associated documentation files (the
7  "Software"), to deal in the Software without restriction, including
8  without limitation the rights to use, copy, modify, merge, publish,
9  distribute, sublicense, and/or sell copies of the Software, and to
10  permit persons to whom the Software is furnished to do so, subject to
11  the following conditions:
12 
13  The above copyright notice and this permission notice shall be
14  included in all copies or substantial portions of the Software.
15 
16  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20  BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21  ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22  CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23  SOFTWARE.
24 */
25 
26 /* Contact: Mengyao Zhao <zhangmp@bc.edu> */
27 
28 /*
29  * ssw.c
30  *
31  * Created by Mengyao Zhao on 6/22/10.
32  * Copyright 2010 Boston College. All rights reserved.
33  * Version 0.1.4
34  * Last revision by Mengyao Zhao on 12/07/12.
35  *
36  */
37 
38 #include <emmintrin.h>
39 #include <stdlib.h>
40 #include <stdio.h>
41 #include <string.h>
42 #include <math.h>
43 #include "ssw.h"
44 
45 #ifdef __GNUC__
46 #define LIKELY(x) __builtin_expect((x),1)
47 #define UNLIKELY(x) __builtin_expect((x),0)
48 #else
49 #define LIKELY(x) (x)
50 #define UNLIKELY(x) (x)
51 #endif
52 
53 /* Convert the coordinate in the scoring matrix into the coordinate in one line of the band. */
54 #define set_u(u, w, i, j) { int x=(i)-(w); x=x>0?x:0; (u)=(j)-x+1; }
55 
56 /* Convert the coordinate in the direction matrix into the coordinate in one line of the band. */
57 #define set_d(u, w, i, j, p) { int x=(i)-(w); x=x>0?x:0; x=(j)-x; (u)=x*3+p; }
58 
64 #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
65 
66 typedef struct {
68  int32_t ref; //0-based position
69  int32_t read; //alignment ending position on read, 0-based
71 
72 typedef struct {
75 } cigar;
76 
77 struct _profile{
78  __m128i* profile_byte; // 0: none
79  __m128i* profile_word; // 0: none
80  const int8_t* read;
81  const int8_t* mat;
85 };
86 
87 /* Generate query profile rearrange query sequence & calculate the weight of match/mismatch. */
88 __m128i* qP_byte (const int8_t* read_num,
89  const int8_t* mat,
90  const int32_t readLen,
91  const int32_t n, /* the edge length of the squre matrix mat */
92  uint8_t bias) {
93 
94  int32_t segLen = (readLen + 15) / 16; /* Split the 128 bit register into 16 pieces.
95  Each piece is 8 bit. Split the read into 16 segments.
96  Calculat 16 segments in parallel.
97  */
98  __m128i* vProfile = (__m128i*)malloc(n * segLen * sizeof(__m128i));
99  int8_t* t = (int8_t*)vProfile;
100  int32_t nt, i, j, segNum;
101 
102  /* Generate query profile rearrange query sequence & calculate the weight of match/mismatch */
103  for (nt = 0; LIKELY(nt < n); nt ++) {
104  for (i = 0; i < segLen; i ++) {
105  j = i;
106  for (segNum = 0; LIKELY(segNum < 16) ; segNum ++) {
107  *t++ = j>= readLen ? bias : mat[nt * n + read_num[j]] + bias;
108  j += segLen;
109  }
110  }
111  }
112  return vProfile;
113 }
114 
115 /* Striped Smith-Waterman
116  Record the highest score of each reference position.
117  Return the alignment score and ending position of the best alignment, 2nd best alignment, etc.
118  Gap begin and gap extension are different.
119  wight_match > 0, all other weights < 0.
120  The returned positions are 0-based.
121  */
123  int8_t ref_dir, // 0: forward ref; 1: reverse ref
124  int32_t refLen,
125  int32_t readLen,
126  const uint8_t weight_gapO, /* will be used as - */
127  const uint8_t weight_gapE, /* will be used as - */
128  __m128i* vProfile,
129  uint8_t terminate, /* the best alignment score: used to terminate
130  the matrix calculation when locating the
131  alignment beginning point. If this score
132  is set to 0, it will not be used */
133  uint8_t bias, /* Shift 0 point to a positive value. */
134  int32_t maskLen) {
135 
136 #define max16(m, vm) (vm) = _mm_max_epu8((vm), _mm_srli_si128((vm), 8)); \
137  (vm) = _mm_max_epu8((vm), _mm_srli_si128((vm), 4)); \
138  (vm) = _mm_max_epu8((vm), _mm_srli_si128((vm), 2)); \
139  (vm) = _mm_max_epu8((vm), _mm_srli_si128((vm), 1)); \
140  (m) = _mm_extract_epi16((vm), 0)
141 
142  uint8_t max = 0; /* the max alignment score */
143  int32_t end_read = readLen - 1;
144  int32_t end_ref = -1; /* 0_based best alignment ending point; Initialized as isn't aligned -1. */
145  int32_t segLen = (readLen + 15) / 16; /* number of segment */
146 
147  /* array to record the largest score of each reference position */
148  uint8_t* maxColumn = (uint8_t*) calloc(refLen, 1);
149 
150  /* array to record the alignment read ending position of the largest score of each reference position */
151  int32_t* end_read_column = (int32_t*) calloc(refLen, sizeof(int32_t));
152 
153  /* Define 16 byte 0 vector. */
154  __m128i vZero = _mm_set1_epi32(0);
155 
156  __m128i* pvHStore = (__m128i*) calloc(segLen, sizeof(__m128i));
157  __m128i* pvHLoad = (__m128i*) calloc(segLen, sizeof(__m128i));
158  __m128i* pvE = (__m128i*) calloc(segLen, sizeof(__m128i));
159  __m128i* pvHmax = (__m128i*) calloc(segLen, sizeof(__m128i));
160 
161  int32_t i, j;
162  /* 16 byte insertion begin vector */
163  __m128i vGapO = _mm_set1_epi8(weight_gapO);
164 
165  /* 16 byte insertion extension vector */
166  __m128i vGapE = _mm_set1_epi8(weight_gapE);
167 
168  /* 16 byte bias vector */
169  __m128i vBias = _mm_set1_epi8(bias);
170 
171  __m128i vMaxScore = vZero; /* Trace the highest score of the whole SW matrix. */
172  __m128i vMaxMark = vZero; /* Trace the highest score till the previous column. */
173  __m128i vTemp;
174  int32_t edge, begin = 0, end = refLen, step = 1;
175 // int32_t distance = readLen * 2 / 3;
176 // int32_t distance = readLen / 2;
177 // int32_t distance = readLen;
178 
179  /* outer loop to process the reference sequence */
180  if (ref_dir == 1) {
181  begin = refLen - 1;
182  end = -1;
183  step = -1;
184  }
185  for (i = begin; LIKELY(i != end); i += step) {
186  int32_t cmp;
187  __m128i e = vZero, vF = vZero, vMaxColumn = vZero; /* Initialize F value to 0.
188  Any errors to vH values will be corrected in the Lazy_F loop.
189  */
190 // max16(maxColumn[i], vMaxColumn);
191 // fprintf(stderr, "middle[%d]: %d\n", i, maxColumn[i]);
192 
193  __m128i vH = pvHStore[segLen - 1];
194  vH = _mm_slli_si128 (vH, 1); /* Shift the 128-bit value in vH left by 1 byte. */
195  __m128i* vP = vProfile + ref[i] * segLen; /* Right part of the vProfile */
196 
197  /* Swap the 2 H buffers. */
198  __m128i* pv = pvHLoad;
199  pvHLoad = pvHStore;
200  pvHStore = pv;
201 
202  /* inner loop to process the query sequence */
203  for (j = 0; LIKELY(j < segLen); ++j) {
204  vH = _mm_adds_epu8(vH, _mm_load_si128(vP + j));
205  vH = _mm_subs_epu8(vH, vBias); /* vH will be always > 0 */
206  // max16(maxColumn[i], vH);
207  // fprintf(stderr, "H[%d]: %d\n", i, maxColumn[i]);
208 // int8_t* t;
209 // int32_t ti;
210 //for (t = (int8_t*)&vH, ti = 0; ti < 16; ++ti) fprintf(stderr, "%d\t", *t++);
211 
212  /* Get max from vH, vE and vF. */
213  e = _mm_load_si128(pvE + j);
214  vH = _mm_max_epu8(vH, e);
215  vH = _mm_max_epu8(vH, vF);
216  vMaxColumn = _mm_max_epu8(vMaxColumn, vH);
217 
218  // max16(maxColumn[i], vMaxColumn);
219  // fprintf(stderr, "middle[%d]: %d\n", i, maxColumn[i]);
220 // for (t = (int8_t*)&vMaxColumn, ti = 0; ti < 16; ++ti) fprintf(stderr, "%d\t", *t++);
221 
222  /* Save vH values. */
223  _mm_store_si128(pvHStore + j, vH);
224 
225  /* Update vE value. */
226  vH = _mm_subs_epu8(vH, vGapO); /* saturation arithmetic, result >= 0 */
227  e = _mm_subs_epu8(e, vGapE);
228  e = _mm_max_epu8(e, vH);
229  _mm_store_si128(pvE + j, e);
230 
231  /* Update vF value. */
232  vF = _mm_subs_epu8(vF, vGapE);
233  vF = _mm_max_epu8(vF, vH);
234 
235  /* Load the next vH. */
236  vH = _mm_load_si128(pvHLoad + j);
237  }
238 
239  /* Lazy_F loop: has been revised to disallow adjecent insertion and then deletion, so don't update E(i, j), learn from SWPS3 */
240  /* reset pointers to the start of the saved data */
241  j = 0;
242  vH = _mm_load_si128 (pvHStore + j);
243 
244  /* the computed vF value is for the given column. since */
245  /* we are at the end, we need to shift the vF value over */
246  /* to the next column. */
247  vF = _mm_slli_si128 (vF, 1);
248  vTemp = _mm_subs_epu8 (vH, vGapO);
249  vTemp = _mm_subs_epu8 (vF, vTemp);
250  vTemp = _mm_cmpeq_epi8 (vTemp, vZero);
251  cmp = _mm_movemask_epi8 (vTemp);
252 
253  while (cmp != 0xffff)
254  {
255  vH = _mm_max_epu8 (vH, vF);
256  vMaxColumn = _mm_max_epu8(vMaxColumn, vH);
257  _mm_store_si128 (pvHStore + j, vH);
258  vF = _mm_subs_epu8 (vF, vGapE);
259  j++;
260  if (j >= segLen)
261  {
262  j = 0;
263  vF = _mm_slli_si128 (vF, 1);
264  }
265  vH = _mm_load_si128 (pvHStore + j);
266 
267  vTemp = _mm_subs_epu8 (vH, vGapO);
268  vTemp = _mm_subs_epu8 (vF, vTemp);
269  vTemp = _mm_cmpeq_epi8 (vTemp, vZero);
270  cmp = _mm_movemask_epi8 (vTemp);
271  }
272 
273  vMaxScore = _mm_max_epu8(vMaxScore, vMaxColumn);
274  vTemp = _mm_cmpeq_epi8(vMaxMark, vMaxScore);
275  cmp = _mm_movemask_epi8(vTemp);
276  if (cmp != 0xffff) {
277  uint8_t temp;
278  vMaxMark = vMaxScore;
279  max16(temp, vMaxScore);
280  vMaxScore = vMaxMark;
281 
282  if (LIKELY(temp > max)) {
283  max = temp;
284  if (max + bias >= 255) break; //overflow
285  end_ref = i;
286 
287  /* Store the column with the highest alignment score in order to trace the alignment ending position on read. */
288  for (j = 0; LIKELY(j < segLen); ++j) pvHmax[j] = pvHStore[j];
289  }
290  }
291 
292  /* Record the max score of current column. */
293  max16(maxColumn[i], vMaxColumn);
294 // fprintf(stderr, "maxColumn[%d]: %d\n", i, maxColumn[i]);
295  if (maxColumn[i] == terminate) break;
296  }
297 
298  /* Trace the alignment ending position on read. */
299  uint8_t *t = (uint8_t*)pvHmax;
300  int32_t column_len = segLen * 16;
301  for (i = 0; LIKELY(i < column_len); ++i, ++t) {
302  int32_t temp;
303  if (*t == max) {
304  temp = i / 16 + i % 16 * segLen;
305  if (temp < end_read) end_read = temp;
306  }
307  }
308 
309  free(pvHmax);
310  free(pvE);
311  free(pvHLoad);
312  free(pvHStore);
313 
314  /* Find the most possible 2nd best alignment. */
315  alignment_end* bests = (alignment_end*) calloc(2, sizeof(alignment_end));
316  bests[0].score = max + bias >= 255 ? 255 : max;
317  bests[0].ref = end_ref;
318  bests[0].read = end_read;
319 
320  bests[1].score = 0;
321  bests[1].ref = 0;
322  bests[1].read = 0;
323 
324  edge = (end_ref - maskLen) > 0 ? (end_ref - maskLen) : 0;
325  for (i = 0; i < edge; i ++) {
326 // fprintf (stderr, "maxColumn[%d]: %d\n", i, maxColumn[i]);
327  if (maxColumn[i] > bests[1].score) {
328  bests[1].score = maxColumn[i];
329  bests[1].ref = i;
330  }
331  }
332  edge = (end_ref + maskLen) > refLen ? refLen : (end_ref + maskLen);
333  for (i = edge + 1; i < refLen; i ++) {
334 // fprintf (stderr, "refLen: %d\tmaxColumn[%d]: %d\n", refLen, i, maxColumn[i]);
335  if (maxColumn[i] > bests[1].score) {
336  bests[1].score = maxColumn[i];
337  bests[1].ref = i;
338  }
339  }
340 
341  free(maxColumn);
342  free(end_read_column);
343  return bests;
344 }
345 
346 __m128i* qP_word (const int8_t* read_num,
347  const int8_t* mat,
348  const int32_t readLen,
349  const int32_t n) {
350 
351  int32_t segLen = (readLen + 7) / 8;
352  __m128i* vProfile = (__m128i*)malloc(n * segLen * sizeof(__m128i));
353  int16_t* t = (int16_t*)vProfile;
354  int32_t nt, i, j;
355  int32_t segNum;
356 
357  /* Generate query profile rearrange query sequence & calculate the weight of match/mismatch */
358  for (nt = 0; LIKELY(nt < n); nt ++) {
359  for (i = 0; i < segLen; i ++) {
360  j = i;
361  for (segNum = 0; LIKELY(segNum < 8) ; segNum ++) {
362  *t++ = j>= readLen ? 0 : mat[nt * n + read_num[j]];
363  j += segLen;
364  }
365  }
366  }
367  return vProfile;
368 }
369 
371  int8_t ref_dir, // 0: forward ref; 1: reverse ref
372  int32_t refLen,
373  int32_t readLen,
374  const uint8_t weight_gapO, /* will be used as - */
375  const uint8_t weight_gapE, /* will be used as - */
376  __m128i* vProfile,
377  uint16_t terminate,
378  int32_t maskLen) {
379 
380 #define max8(m, vm) (vm) = _mm_max_epi16((vm), _mm_srli_si128((vm), 8)); \
381  (vm) = _mm_max_epi16((vm), _mm_srli_si128((vm), 4)); \
382  (vm) = _mm_max_epi16((vm), _mm_srli_si128((vm), 2)); \
383  (m) = _mm_extract_epi16((vm), 0)
384 
385  uint16_t max = 0; /* the max alignment score */
386  int32_t end_read = readLen - 1;
387  int32_t end_ref = 0; /* 1_based best alignment ending point; Initialized as isn't aligned - 0. */
388  int32_t segLen = (readLen + 7) / 8; /* number of segment */
389 
390  /* array to record the largest score of each reference position */
391  uint16_t* maxColumn = (uint16_t*) calloc(refLen, 2);
392 
393  /* array to record the alignment read ending position of the largest score of each reference position */
394  int32_t* end_read_column = (int32_t*) calloc(refLen, sizeof(int32_t));
395 
396  /* Define 16 byte 0 vector. */
397  __m128i vZero = _mm_set1_epi32(0);
398 
399  __m128i* pvHStore = (__m128i*) calloc(segLen, sizeof(__m128i));
400  __m128i* pvHLoad = (__m128i*) calloc(segLen, sizeof(__m128i));
401  __m128i* pvE = (__m128i*) calloc(segLen, sizeof(__m128i));
402  __m128i* pvHmax = (__m128i*) calloc(segLen, sizeof(__m128i));
403 
404  int32_t i, j, k;
405  /* 16 byte insertion begin vector */
406  __m128i vGapO = _mm_set1_epi16(weight_gapO);
407 
408  /* 16 byte insertion extension vector */
409  __m128i vGapE = _mm_set1_epi16(weight_gapE);
410 
411  /* 16 byte bias vector */
412  __m128i vMaxScore = vZero; /* Trace the highest score of the whole SW matrix. */
413  __m128i vMaxMark = vZero; /* Trace the highest score till the previous column. */
414  __m128i vTemp;
415  int32_t edge, begin = 0, end = refLen, step = 1;
416 
417  /* outer loop to process the reference sequence */
418  if (ref_dir == 1) {
419  begin = refLen - 1;
420  end = -1;
421  step = -1;
422  }
423  for (i = begin; LIKELY(i != end); i += step) {
424  int32_t cmp;
425  __m128i e = vZero, vF = vZero; /* Initialize F value to 0.
426  Any errors to vH values will be corrected in the Lazy_F loop.
427  */
428  __m128i vH = pvHStore[segLen - 1];
429  vH = _mm_slli_si128 (vH, 2); /* Shift the 128-bit value in vH left by 2 byte. */
430 
431  /* Swap the 2 H buffers. */
432  __m128i* pv = pvHLoad;
433 
434  __m128i vMaxColumn = vZero; /* vMaxColumn is used to record the max values of column i. */
435 
436  __m128i* vP = vProfile + ref[i] * segLen; /* Right part of the vProfile */
437  pvHLoad = pvHStore;
438  pvHStore = pv;
439 
440  /* inner loop to process the query sequence */
441  for (j = 0; LIKELY(j < segLen); j ++) {
442  vH = _mm_adds_epi16(vH, _mm_load_si128(vP + j));
443 
444  /* Get max from vH, vE and vF. */
445  e = _mm_load_si128(pvE + j);
446  vH = _mm_max_epi16(vH, e);
447  vH = _mm_max_epi16(vH, vF);
448  vMaxColumn = _mm_max_epi16(vMaxColumn, vH);
449 
450  /* Save vH values. */
451  _mm_store_si128(pvHStore + j, vH);
452 
453  /* Update vE value. */
454  vH = _mm_subs_epu16(vH, vGapO); /* saturation arithmetic, result >= 0 */
455  e = _mm_subs_epu16(e, vGapE);
456  e = _mm_max_epi16(e, vH);
457  _mm_store_si128(pvE + j, e);
458 
459  /* Update vF value. */
460  vF = _mm_subs_epu16(vF, vGapE);
461  vF = _mm_max_epi16(vF, vH);
462 
463  /* Load the next vH. */
464  vH = _mm_load_si128(pvHLoad + j);
465  }
466 
467  /* Lazy_F loop: has been revised to disallow adjecent insertion and then deletion, so don't update E(i, j), learn from SWPS3 */
468  for (k = 0; LIKELY(k < 8); ++k) {
469  vF = _mm_slli_si128 (vF, 2);
470  for (j = 0; LIKELY(j < segLen); ++j) {
471  vH = _mm_load_si128(pvHStore + j);
472  vH = _mm_max_epi16(vH, vF);
473  _mm_store_si128(pvHStore + j, vH);
474  vH = _mm_subs_epu16(vH, vGapO);
475  vF = _mm_subs_epu16(vF, vGapE);
476  if (UNLIKELY(! _mm_movemask_epi8(_mm_cmpgt_epi16(vF, vH)))) goto end;
477  }
478  }
479 
480 end:
481  vMaxScore = _mm_max_epi16(vMaxScore, vMaxColumn);
482  vTemp = _mm_cmpeq_epi16(vMaxMark, vMaxScore);
483  cmp = _mm_movemask_epi8(vTemp);
484  if (cmp != 0xffff) {
485  uint16_t temp;
486  vMaxMark = vMaxScore;
487  max8(temp, vMaxScore);
488  vMaxScore = vMaxMark;
489 
490  if (LIKELY(temp > max)) {
491  max = temp;
492  end_ref = i;
493  for (j = 0; LIKELY(j < segLen); ++j) pvHmax[j] = pvHStore[j];
494  }
495  }
496 
497  /* Record the max score of current column. */
498  max8(maxColumn[i], vMaxColumn);
499  if (maxColumn[i] == terminate) break;
500  }
501 
502  /* Trace the alignment ending position on read. */
503  uint16_t *t = (uint16_t*)pvHmax;
504  int32_t column_len = segLen * 8;
505  for (i = 0; LIKELY(i < column_len); ++i, ++t) {
506  int32_t temp;
507  if (*t == max) {
508  temp = i / 8 + i % 8 * segLen;
509  if (temp < end_read) end_read = temp;
510  }
511  }
512 
513  free(pvHmax);
514  free(pvE);
515  free(pvHLoad);
516  free(pvHStore);
517 
518  /* Find the most possible 2nd best alignment. */
519  alignment_end* bests = (alignment_end*) calloc(2, sizeof(alignment_end));
520  bests[0].score = max;
521  bests[0].ref = end_ref;
522  bests[0].read = end_read;
523 
524  bests[1].score = 0;
525  bests[1].ref = 0;
526  bests[1].read = 0;
527 
528  edge = (end_ref - maskLen) > 0 ? (end_ref - maskLen) : 0;
529  for (i = 0; i < edge; i ++) {
530  if (maxColumn[i] > bests[1].score) {
531  bests[1].score = maxColumn[i];
532  bests[1].ref = i;
533  }
534  }
535  edge = (end_ref + maskLen) > refLen ? refLen : (end_ref + maskLen);
536  for (i = edge; i < refLen; i ++) {
537  if (maxColumn[i] > bests[1].score) {
538  bests[1].score = maxColumn[i];
539  bests[1].ref = i;
540  }
541  }
542 
543  free(maxColumn);
544  free(end_read_column);
545  return bests;
546 }
547 
548 cigar* banded_sw (const int8_t* ref,
549  const int8_t* read,
550  int32_t refLen,
551  int32_t readLen,
552  int32_t score,
553  const uint32_t weight_gapO, /* will be used as - */
554  const uint32_t weight_gapE, /* will be used as - */
555  int32_t band_width,
556  const int8_t* mat, /* pointer to the weight matrix */
557  int32_t n) {
558 
559  uint32_t *c = (uint32_t*)malloc(16 * sizeof(uint32_t)), *c1;
560  int32_t i, j, e, f, temp1, temp2, s = 16, s1 = 8, s2 = 1024, l, max = 0;
561  int32_t width, width_d, *h_b, *e_b, *h_c;
562  int8_t *direction, *direction_line;
563  cigar* result = (cigar*)malloc(sizeof(cigar));
564  h_b = (int32_t*)malloc(s1 * sizeof(int32_t));
565  e_b = (int32_t*)malloc(s1 * sizeof(int32_t));
566  h_c = (int32_t*)malloc(s1 * sizeof(int32_t));
567  direction = (int8_t*)malloc(s2 * sizeof(int8_t));
568 
569  do {
570  width = band_width * 2 + 3, width_d = band_width * 2 + 1;
571  while (width >= s1) {
572  ++s1;
573  kroundup32(s1);
574  h_b = (int32_t*)realloc(h_b, s1 * sizeof(int32_t));
575  e_b = (int32_t*)realloc(e_b, s1 * sizeof(int32_t));
576  h_c = (int32_t*)realloc(h_c, s1 * sizeof(int32_t));
577  }
578  while (width_d * readLen * 3 >= s2) {
579  ++s2;
580  kroundup32(s2);
581  if (s2 < 0) {
582  fprintf(stderr, "Alignment score and position are not consensus.\n");
583  exit(1);
584  }
585  direction = (int8_t*)realloc(direction, s2 * sizeof(int8_t));
586  }
587  direction_line = direction;
588  for (j = 1; LIKELY(j < width - 1); j ++) h_b[j] = 0;
589  for (i = 0; LIKELY(i < readLen); i ++) {
590  int32_t beg = 0, end = refLen - 1, u = 0, edge;
591  j = i - band_width; beg = beg > j ? beg : j; // band start
592  j = i + band_width; end = end < j ? end : j; // band end
593  edge = end + 1 < width - 1 ? end + 1 : width - 1;
594  f = h_b[0] = e_b[0] = h_b[edge] = e_b[edge] = h_c[0] = 0;
595  direction_line = direction + width_d * i * 3;
596 
597  for (j = beg; LIKELY(j <= end); j ++) {
598  int32_t b, e1, f1, d, de, df, dh;
599  set_u(u, band_width, i, j); set_u(e, band_width, i - 1, j);
600  set_u(b, band_width, i, j - 1); set_u(d, band_width, i - 1, j - 1);
601  set_d(de, band_width, i, j, 0);
602  set_d(df, band_width, i, j, 1);
603  set_d(dh, band_width, i, j, 2);
604 
605  temp1 = i == 0 ? -weight_gapO : h_b[e] - weight_gapO;
606  temp2 = i == 0 ? -weight_gapE : e_b[e] - weight_gapE;
607  e_b[u] = temp1 > temp2 ? temp1 : temp2;
608  direction_line[de] = temp1 > temp2 ? 3 : 2;
609 
610  temp1 = h_c[b] - weight_gapO;
611  temp2 = f - weight_gapE;
612  f = temp1 > temp2 ? temp1 : temp2;
613  direction_line[df] = temp1 > temp2 ? 5 : 4;
614 
615  e1 = e_b[u] > 0 ? e_b[u] : 0;
616  f1 = f > 0 ? f : 0;
617  temp1 = e1 > f1 ? e1 : f1;
618  temp2 = h_b[d] + mat[ref[j] * n + read[i]];
619  h_c[u] = temp1 > temp2 ? temp1 : temp2;
620 
621  if (h_c[u] > max) max = h_c[u];
622 
623  if (temp1 <= temp2) direction_line[dh] = 1;
624  else direction_line[dh] = e1 > f1 ? direction_line[de] : direction_line[df];
625  }
626  for (j = 1; j <= u; j ++) h_b[j] = h_c[j];
627  }
628  band_width *= 2;
629  } while (LIKELY(max < score));
630  band_width /= 2;
631 
632  // trace back
633  i = readLen - 1;
634  j = refLen - 1;
635  e = 0; // Count the number of M, D or I.
636  l = 0; // record length of current cigar
637  f = max = 0; // M
638  temp2 = 2; // h
639  while (LIKELY(i > 0)) {
640  set_d(temp1, band_width, i, j, temp2);
641  switch (direction_line[temp1]) {
642  case 1:
643  --i;
644  --j;
645  temp2 = 2;
646  direction_line -= width_d * 3;
647  f = 0; // M
648  break;
649  case 2:
650  --i;
651  temp2 = 0; // e
652  direction_line -= width_d * 3;
653  f = 1; // I
654  break;
655  case 3:
656  --i;
657  temp2 = 2;
658  direction_line -= width_d * 3;
659  f = 1; // I
660  break;
661  case 4:
662  --j;
663  temp2 = 1;
664  f = 2; // D
665  break;
666  case 5:
667  --j;
668  temp2 = 2;
669  f = 2; // D
670  break;
671  default:
672  fprintf(stderr, "Trace back error: %d.\n", direction_line[temp1 - 1]);
673  return 0;
674  }
675  if (f == max) ++e;
676  else {
677  ++l;
678  while (l >= s) {
679  ++s;
680  kroundup32(s);
681  c = (uint32_t*)realloc(c, s * sizeof(uint32_t));
682  }
683  c[l - 1] = e<<4|max;
684  max = f;
685  e = 1;
686  }
687  }
688  if (f == 0) {
689  ++l;
690  while (l >= s) {
691  ++s;
692  kroundup32(s);
693  c = (uint32_t*)realloc(c, s * sizeof(uint32_t));
694  }
695  c[l - 1] = (e+1)<<4;
696  }else {
697  l += 2;
698  while (l >= s) {
699  ++s;
700  kroundup32(s);
701  c = (uint32_t*)realloc(c, s * sizeof(uint32_t));
702  }
703  c[l - 2] = e<<4|f;
704  c[l - 1] = 16; // 1M
705  }
706 
707  // reverse cigar
708  c1 = (uint32_t*)malloc(l * sizeof(uint32_t));
709  s = 0;
710  e = l - 1;
711  while (LIKELY(s <= e)) {
712  c1[s] = c[e];
713  c1[e] = c[s];
714  ++ s;
715  -- e;
716  }
717  result->seq = c1;
718  result->length = l;
719 
720  free(direction);
721  free(h_c);
722  free(e_b);
723  free(h_b);
724  free(c);
725  return result;
726 }
727 
728 int8_t* seq_reverse(const int8_t* seq, int32_t end) /* end is 0-based alignment ending position */
729 {
730  int8_t* reverse = (int8_t*)calloc(end + 1, sizeof(int8_t));
731  int32_t start = 0;
732  while (LIKELY(start <= end)) {
733  reverse[start] = seq[end];
734  reverse[end] = seq[start];
735  ++ start;
736  -- end;
737  }
738  return reverse;
739 }
740 
741 s_profile* ssw_init (const int8_t* read, const int32_t readLen, const int8_t* mat, const int32_t n, const int8_t score_size) {
742  s_profile* p = (s_profile*)calloc(1, sizeof(struct _profile));
743  p->profile_byte = 0;
744  p->profile_word = 0;
745  p->bias = 0;
746 
747  if (score_size == 0 || score_size == 2) {
748  /* Find the bias to use in the substitution matrix */
749  int32_t bias = 0, i;
750  for (i = 0; i < n*n; i++) if (mat[i] < bias) bias = mat[i];
751  bias = abs(bias);
752 
753  p->bias = bias;
754  p->profile_byte = qP_byte (read, mat, readLen, n, bias);
755  }
756  if (score_size == 1 || score_size == 2) p->profile_word = qP_word (read, mat, readLen, n);
757  p->read = read;
758  p->mat = mat;
759  p->readLen = readLen;
760  p->n = n;
761  return p;
762 }
763 
765  free(p->profile_byte);
766  free(p->profile_word);
767  free(p);
768 }
769 
770 s_align* ssw_align (const s_profile* prof,
771  const int8_t* ref,
772  int32_t refLen,
773  const uint8_t weight_gapO,
774  const uint8_t weight_gapE,
775  const uint8_t flag, // (from high to low) bit 5: return the best alignment beginning position; 6: if (ref_end1 - ref_begin1 <= filterd) && (read_end1 - read_begin1 <= filterd), return cigar; 7: if max score >= filters, return cigar; 8: always return cigar; if 6 & 7 are both setted, only return cigar when both filter fulfilled
776  const uint16_t filters,
777  const int32_t filterd,
778  const int32_t maskLen) {
779 
780  alignment_end* bests = 0, *bests_reverse = 0;
781  __m128i* vP = 0;
782  int32_t word = 0, band_width = 0, readLen = prof->readLen;
783  int8_t* read_reverse = 0;
784  cigar* path;
785  s_align* r = (s_align*)calloc(1, sizeof(s_align));
786  r->ref_begin1 = -1;
787  r->read_begin1 = -1;
788  r->cigar = 0;
789  r->cigarLen = 0;
790  if (maskLen < 15) {
791  fprintf(stderr, "When maskLen < 15, the function ssw_align doesn't return 2nd best alignment information.\n");
792  }
793 
794  // Find the alignment scores and ending positions
795  if (prof->profile_byte) {
796  bests = sw_sse2_byte(ref, 0, refLen, readLen, weight_gapO, weight_gapE, prof->profile_byte, -1, prof->bias, maskLen);
797  if (prof->profile_word && bests[0].score == 255) {
798  free(bests);
799  bests = sw_sse2_word(ref, 0, refLen, readLen, weight_gapO, weight_gapE, prof->profile_word, -1, maskLen);
800  word = 1;
801  } else if (bests[0].score == 255) {
802  fprintf(stderr, "Please set 2 to the score_size parameter of the function ssw_init, otherwise the alignment results will be incorrect.\n");
803  return 0;
804  }
805  }else if (prof->profile_word) {
806  bests = sw_sse2_word(ref, 0, refLen, readLen, weight_gapO, weight_gapE, prof->profile_word, -1, maskLen);
807  word = 1;
808  }else {
809  fprintf(stderr, "Please call the function ssw_init before ssw_align.\n");
810  return 0;
811  }
812  r->score1 = bests[0].score;
813  r->ref_end1 = bests[0].ref;
814  r->read_end1 = bests[0].read;
815  if (maskLen >= 15) {
816  r->score2 = bests[1].score;
817  r->ref_end2 = bests[1].ref;
818  } else {
819  r->score2 = 0;
820  r->ref_end2 = -1;
821  }
822  free(bests);
823  if (flag == 0 || (flag == 2 && r->score1 < filters)) goto end;
824 
825  // Find the beginning position of the best alignment.
826  read_reverse = seq_reverse(prof->read, r->read_end1);
827  if (word == 0) {
828  vP = qP_byte(read_reverse, prof->mat, r->read_end1 + 1, prof->n, prof->bias);
829  bests_reverse = sw_sse2_byte(ref, 1, r->ref_end1 + 1, r->read_end1 + 1, weight_gapO, weight_gapE, vP, r->score1, prof->bias, maskLen);
830  } else {
831  vP = qP_word(read_reverse, prof->mat, r->read_end1 + 1, prof->n);
832  bests_reverse = sw_sse2_word(ref, 1, r->ref_end1 + 1, r->read_end1 + 1, weight_gapO, weight_gapE, vP, r->score1, maskLen);
833  }
834  free(vP);
835  free(read_reverse);
836  r->ref_begin1 = bests_reverse[0].ref;
837  r->read_begin1 = r->read_end1 - bests_reverse[0].read;
838  free(bests_reverse);
839  if ((7&flag) == 0 || ((2&flag) != 0 && r->score1 < filters) || ((4&flag) != 0 && (r->ref_end1 - r->ref_begin1 > filterd || r->read_end1 - r->read_begin1 > filterd))) goto end;
840 
841  // Generate cigar.
842  refLen = r->ref_end1 - r->ref_begin1 + 1;
843  readLen = r->read_end1 - r->read_begin1 + 1;
844  band_width = abs(refLen - readLen) + 1;
845  path = banded_sw(ref + r->ref_begin1, prof->read + r->read_begin1, refLen, readLen, r->score1, weight_gapO, weight_gapE, band_width, prof->mat, prof->n);
846  if (path == 0) r = 0;
847  else {
848  r->cigar = path->seq;
849  r->cigarLen = path->length;
850  free(path);
851  }
852 
853 end:
854  return r;
855 }
856 
858  free(a->cigar);
859  free(a);
860 }