// nvbio/cpu_8cpp_source.html

#include <nvbio/basic/types.h>

#include <nvbio/basic/numbers.h>

#include <nvbio/basic/timer.h>

#include <emmintrin.h>

#include <stdio.h>

#include <stdlib.h>

#include <vector>


using namespace nvbio;


// Give the fixed-width integer types <stdint.h>-style names, which is how the
// alignment code below spells them.
// NOTE(review): int8..uint64 are presumably the nvbio types declared in
// nvbio/basic/types.h and pulled in by `using namespace nvbio` - confirm.
typedef int8   int8_t;
typedef uint8  uint8_t;

typedef int16  int16_t;
typedef uint16 uint16_t;

typedef int32  int32_t;
typedef uint32 uint32_t;

typedef int64  int64_t;
typedef uint64 uint64_t;


// Flag bits of the `xtra` argument taken by the alignment routines.  The low
// 16 bits of `xtra` (xtra & 0xffff) carry a score threshold; the flags below
// occupy bits 16..19.
#define KSW_XBYTE  (1 << 16) // not referenced by the code in this file
#define KSW_XSTOP  (1 << 17) // stop as soon as the best score reaches the threshold
#define KSW_XSUBO  (1 << 18) // record candidate sub-optimal scores >= the threshold
#define KSW_XSTART (1 << 19) // not referenced by the code in this file


// Alignment result.
// The members are brace-initialized positionally elsewhere in this file, so
// their declaration order must not change.
struct kswr_t
{
    int score;  // best score
    int te;     // target end
    int qe;     // query end
    int score2; // second best score
    int te2;    // ending position of the second best score on the target
    int tb;     // target start
    int qb;     // query start
};


// Query profile plus the working rows used by the striped Smith-Waterman
// kernel; filled in by ksw_qinit().
struct kswq_t
{
    int qlen;       // query length
    int slen;       // segmented length: number of __m128i vectors per row

    uint8_t shift;  // offset added to byte scores (the negated minimum score)
    uint8_t mdiff;  // difference between the maximum and minimum score
    uint8_t max;    // maximum score in the scoring matrix
    uint8_t size;   // bytes per score cell: 1 or 2

    __m128i* qp;    // query profile: slen vectors for each alphabet symbol
    __m128i* H0;    // working row of slen vectors
    __m128i* H1;    // working row of slen vectors
    __m128i* E;     // working row of slen vectors
    __m128i* Hmax;  // working row of slen vectors
};


// Branch-prediction hints.  With GNU-compatible compilers they map onto
// __builtin_expect; everywhere else they simply evaluate their argument.
#if defined(__GNUC__)
#  define LIKELY(x)   __builtin_expect((x),1)
#  define UNLIKELY(x) __builtin_expect((x),0)
#else
#  define LIKELY(x)   (x)
#  define UNLIKELY(x) (x)
#endif


// Returns the largest of the 16 unsigned bytes packed in `xx`.
//
// The vector is folded onto itself four times (shifting right by 8, 4, 2 and
// 1 bytes and taking the byte-wise unsigned maximum), which leaves the overall
// maximum in byte 0; that byte is then extracted.
//
// This is a host-side helper: it is built from SSE2 intrinsics and is called
// from host code, so it must not carry a device-only execution-space
// qualifier.
int __max_16(__m128i xx)
{
    xx = _mm_max_epu8(xx, _mm_srli_si128(xx, 8));
    xx = _mm_max_epu8(xx, _mm_srli_si128(xx, 4));
    xx = _mm_max_epu8(xx, _mm_srli_si128(xx, 2));
    xx = _mm_max_epu8(xx, _mm_srli_si128(xx, 1));

    // only the low byte of the extracted 16-bit lane holds the result
    return _mm_extract_epi16(xx, 0) & 0x00ff;
}


// Allocates and fills the query profile used by the striped Smith-Waterman
// kernel.
//
//   size   bytes per score cell: any value > 1 selects 16-bit cells,
//          everything else 8-bit cells
//   qlen   number of query symbols
//   query  query symbols; each one is used as a column index into a row of mat
//   m      alphabet size; mat is read as an m x m row-major matrix
//   mat    substitution scores
//
// Returns a profile held in a single malloc() block (release it with free()),
// or NULL if the allocation fails.
//
// Layout of the block: the kswq_t header, then (16-byte aligned) m profile
// rows of slen vectors each, followed by the H0, H1, E and Hmax rows of slen
// vectors each.
kswq_t* ksw_qinit(int size, int qlen, const uint8_t *query, int m, const int8_t *mat)
{
    size = size > 1 ? 2 : 1;

    const int p    = 8 * (3 - size);     // # values per __m128i
    const int slen = (qlen + p - 1) / p; // segmented length

    // a single block of memory; the extra 256 bytes leave room for alignment
    kswq_t* q = (kswq_t*)malloc(sizeof(kswq_t) + 256 + 16 * slen * (m + 4));
    if (q == NULL)
        return NULL; // out of memory: report it instead of dereferencing NULL

    q->qp   = (__m128i*)(((size_t)q + sizeof(kswq_t) + 15) >> 4 << 4); // align memory
    q->H0   = q->qp + slen * m;
    q->H1   = q->H0 + slen;
    q->E    = q->H1 + slen;
    q->Hmax = q->E  + slen;
    q->slen = slen;
    q->qlen = qlen;
    q->size = size;

    // find the minimum and maximum score of the matrix
    int min_score = 127, max_score = 0;
    const int n_scores = m * m;
    for (int a = 0; a < n_scores; ++a) {
        if (mat[a] < min_score) min_score = mat[a];
        if (mat[a] > max_score) max_score = mat[a];
    }
    q->max   = (uint8_t)max_score;
    q->shift = (uint8_t)(-min_score);            // NB: q->shift is uint8_t
    q->mdiff = (uint8_t)(max_score + q->shift);  // difference between the min and max scores

    // Striped layout: vector i of a row holds query positions i, i+slen,
    // i+2*slen, ...; positions past the end of the query score 0.
    // An example: p=8, qlen=19, slen=3 and segmentation:
    //  {{0,3,6,9,12,15,18,-1},{1,4,7,10,13,16,-1,-1},{2,5,8,11,14,17,-1,-1}}
    const int nlen = slen * p;
    if (size == 1) {
        // byte cells: scores are biased by q->shift
        int8_t* t = (int8_t*)q->qp;
        for (int a = 0; a < m; ++a) {
            const int8_t* ma = mat + a * m;
            for (int i = 0; i < slen; ++i)
                for (int k = i; k < nlen; k += slen) // p iterations
                    *t++ = (k >= qlen ? 0 : ma[query[k]]) + q->shift;
        }
    } else {
        // 16-bit cells: scores are stored unbiased
        int16_t* t = (int16_t*)q->qp;
        for (int a = 0; a < m; ++a) {
            const int8_t* ma = mat + a * m;
            for (int i = 0; i < slen; ++i)
                for (int k = i; k < nlen; k += slen) // p iterations
                    *t++ = (k >= qlen ? 0 : ma[query[k]]);
        }
    }
    return q;
}


// Striped Smith-Waterman with 8-bit cells of one query profile against one
// target sequence.
//
//   q       query profile built by ksw_qinit() with byte-sized cells
//   tlen    number of target symbols
//   target  target symbols; each one selects a row of the query profile
//   _gapo   gap open penalty
//   _gape   gap extension penalty; both penalties are subtracted with
//           unsigned saturation and the first gap costs _gapo + _gape
//   xtra    KSW_X* flags; the low 16 bits carry the score threshold used by
//           KSW_XSUBO (record sub-optimal candidates) and KSW_XSTOP (early
//           termination)
//
// Returns the best score (255 if the byte arithmetic saturated) with its
// target end; unless the score saturated, also the query end and the best
// sub-optimal score/target end recorded outside a window around the best hit.
// The start coordinates tb/qb are left at -1.
//
// The working rows and the first four profile rows are kept in on-stack
// buffers when the query is short enough (slen <= 16); longer queries use the
// buffers owned by `q` instead, which then get overwritten by the call.
kswr_t ksw_u8(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape, int xtra) // the first gap costs -(_o+_e)
{
    const kswr_t g_defr = { 0, -1, -1, -1, -1, -1, -1 };

    enum {
        LMEM_VECS = 256/16, // vectors per on-stack working row
        LMEM_ROWS = 4,      // profile rows mirrored on the stack
        B_STACK   = 256     // sub-optimal candidates kept on the stack
    };

    int slen, i, n_b, m_b, te = -1, gmax = 0, minsc, endsc;
    uint64_t b_stack[B_STACK];
    std::vector<uint64_t> b_heap;  // spill-over storage once b_stack is full
    uint64_t* b = b_stack;         // candidate list: score in the high word, target position in the low word
    __m128i zero, gapoe, gape, shift, *H0, *H1, *E, *Hmax;
    kswr_t r;

    // keep hot arrays in local memory
    __m128i lmem_H0[LMEM_VECS];
    __m128i lmem_H1[LMEM_VECS];
    __m128i lmem_E[LMEM_VECS];
    __m128i lmem_Hmax[LMEM_VECS];
    __m128i lmem_qp[LMEM_ROWS*LMEM_VECS];

    // initialization
    r = g_defr;
    minsc = (xtra&KSW_XSUBO)? xtra&0xffff : 0x10000;
    endsc = (xtra&KSW_XSTOP)? xtra&0xffff : 0x10000;
    n_b = 0;
    m_b = B_STACK;
    zero = _mm_set1_epi32(0);
    gapoe = _mm_set1_epi8(_gapo + _gape);
    gape = _mm_set1_epi8(_gape);
    shift = _mm_set1_epi8(q->shift);
    slen = q->slen;

    // an empty query has no cells to fill: skip the core loop altogether
    // rather than reading in front of the working rows
    if (slen <= 0) tlen = 0;

    // the on-stack rows only hold LMEM_VECS vectors; fall back to the rows
    // owned by the profile when the query does not fit
    const bool use_lmem = slen <= LMEM_VECS;
    if (use_lmem) {
        H0 = lmem_H0; H1 = lmem_H1; E = lmem_E; Hmax = lmem_Hmax;
    } else {
        H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax;
    }

    for (i = 0; i < slen; ++i) {
        _mm_store_si128(E + i, zero);
        _mm_store_si128(H0 + i, zero);
        _mm_store_si128(Hmax + i, zero);
    }

    // copy the query-profile to local memory
    if (use_lmem) {
        for (i = 0; i < slen*LMEM_ROWS; ++i)
            lmem_qp[i] = q->qp[i];
    }

    // the core loop
    for (i = 0; i < tlen; ++i) {
        int j, k, cmp = 0, imax;
        __m128i e, h, f = zero, max = zero;

        // S is the 1st score vector of the profile row selected by target[i];
        // rows that are not mirrored on the stack are read from the profile
        const int row = target[i];
        const __m128i* S = ((use_lmem && row < LMEM_ROWS) ? lmem_qp : q->qp) + row * slen;

        h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the ksw_qinit example
        h = _mm_slli_si128(h, 1); // h=H(i-1,-1); << instead of >> because x64 is little-endian

        for (j = 0; LIKELY(j < slen); ++j) {
            /* SW cells are computed in the following order:
             *   H(i,j)   = max{H(i-1,j-1)+S(i,j), E(i,j), F(i,j)}
             *   E(i+1,j) = max{H(i,j)-q, E(i,j)-r}
             *   F(i,j+1) = max{H(i,j)-q, F(i,j)-r}
             */
            // compute H'(i,j); note that at the beginning, h=H'(i-1,j-1)
            h = _mm_adds_epu8(h, _mm_load_si128(S + j));
            h = _mm_subs_epu8(h, shift); // h=H'(i-1,j-1)+S(i,j)
            e = _mm_load_si128(E + j); // e=E'(i,j)
            h = _mm_max_epu8(h, e);
            h = _mm_max_epu8(h, f); // h=H'(i,j)
            max = _mm_max_epu8(max, h); // set max
            _mm_store_si128(H1 + j, h); // save to H'(i,j)
            // now compute E'(i+1,j)
            h = _mm_subs_epu8(h, gapoe); // h=H'(i,j)-gapo
            e = _mm_subs_epu8(e, gape); // e=E'(i,j)-gape
            e = _mm_max_epu8(e, h); // e=E'(i+1,j)
            _mm_store_si128(E + j, e); // save to E'(i+1,j)
            // now compute F'(i,j+1)
            f = _mm_subs_epu8(f, gape);
            f = _mm_max_epu8(f, h);
            // get H'(i-1,j) and prepare for the next j
            h = _mm_load_si128(H0 + j); // h=H'(i-1,j)
        }

        // NB: we do not need to set E(i,j) as we disallow adjacent insertion and then deletion
        for (k = 0; LIKELY(k < 16); ++k) { // this block mimics SWPS3; NB: H(i,j) updated in the lazy-F loop cannot exceed max
            f = _mm_slli_si128(f, 1);
            for (j = 0; LIKELY(j < slen); ++j) {
                h = _mm_load_si128(H1 + j);
                h = _mm_max_epu8(h, f); // h=H'(i,j)
                _mm_store_si128(H1 + j, h);
                h = _mm_subs_epu8(h, gapoe);
                f = _mm_subs_epu8(f, gape);
                // all lanes of f are <= h: F can no longer improve any cell
                cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_subs_epu8(f, h), zero));
                if (UNLIKELY(cmp == 0xffff)) break;
            }
            if (UNLIKELY(cmp == 0xffff)) break;
        }

        imax = __max_16(max); // imax is the maximum number in max
        if (imax >= minsc) { // write the b array; this condition adds branching unfortunately
            if (n_b == 0 || (int32_t)b[n_b-1] + 1 != i) { // then append
                if (n_b == m_b) {
                    // the current storage is full: continue in a heap buffer
                    // twice as large instead of writing past the end
                    m_b *= 2;
                    if (b == b_stack) b_heap.assign(b_stack, b_stack + n_b);
                    b_heap.resize(m_b);
                    b = &b_heap[0];
                }
                b[n_b++] = (uint64_t)imax<<32 | i;
            } else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 | i; // modify the last
        }
        if (imax > gmax) {
            gmax = imax; te = i; // te is the end position on the target
            for (j = 0; LIKELY(j < slen); ++j) // keep the H1 vector
                _mm_store_si128(Hmax + j, _mm_load_si128(H1 + j));
            if (gmax + q->shift >= 255 || gmax >= endsc) break;
        }

        // swap H0 and H1
        __m128i* swp = H1; H1 = H0; H0 = swp;
    }

    r.score = gmax + q->shift < 255? gmax : 255;
    r.te = te;
    if (r.score != 255) { // get the end of the query match; find the 2nd best score
        int best = -1, tmp, low, high, qlen = slen * 16;
        const uint8_t *t = (const uint8_t*)Hmax;

        // byte i of the striped row corresponds to query position
        // i / 16 + i % 16 * slen; keep the smallest position among ties
        for (i = 0; i < qlen; ++i, ++t) {
            if ((int)*t > best) best = *t, r.qe = i / 16 + i % 16 * slen;
            else if ((int)*t == best && (tmp = i / 16 + i % 16 * slen) < r.qe) r.qe = tmp;
        }

        // without candidates there is nothing to scan, and a matrix without a
        // positive score would make the window computation divide by zero
        if (n_b > 0 && q->max > 0) {
            i = (r.score + q->max - 1) / q->max;
            low = te - i; high = te + i;
            for (i = 0; i < n_b; ++i) {
                int e = (int32_t)b[i];
                if ((e < low || e > high) && (int)(b[i]>>32) > r.score2)
                    r.score2 = b[i]>>32, r.te2 = e;
            }
        }
    }
    return r;
}


// Prints how a few SSE2 byte-shift and extract operations lay out their
// results in memory, as four 32-bit words in hexadecimal.
void cpu_test()
{
    // A plain unsigned int array is not guaranteed to be 16-byte aligned, so
    // the vectors are written to it with the unaligned store.
    unsigned int abcd[4];

    // bytes 0..15 in increasing memory order
    __m128i b = _mm_set_epi8( 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 );

    _mm_storeu_si128( (__m128i*)abcd, b );
    printf("w[0:3]: %X %X %X %X\n", abcd[0], abcd[1], abcd[2], abcd[3]);

    // lowest 16-bit lane
    const int h0 = _mm_extract_epi16( b, 0 );
    printf("h0: %X\n", h0);

    // left shift by one byte
    b = _mm_slli_si128( b, 1 );
    _mm_storeu_si128( (__m128i*)abcd, b );
    printf("w[0:3]: %X %X %X %X\n", abcd[0], abcd[1], abcd[2], abcd[3]);

    // left shift by four bytes
    b = _mm_set_epi8( 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 );
    b = _mm_slli_si128( b, 4 );
    _mm_storeu_si128( (__m128i*)abcd, b );
    printf("w[0:3]: %X %X %X %X\n", abcd[0], abcd[1], abcd[2], abcd[3]);

    // right shift by one byte
    b = _mm_set_epi8( 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 );
    b = _mm_srli_si128( b, 1 );
    _mm_storeu_si128( (__m128i*)abcd, b );
    printf("w[0:3]: %X %X %X %X\n", abcd[0], abcd[1], abcd[2], abcd[3]);
}


void cpu_ksw_test()

{

    const uint32_t N    = 100*1000;

    const uint32   qlen = 150;

    const uint32   tlen = 200;

    const uint32   m    = 4;


    std::vector<kswq_t*> qp( N );

    std::vector<kswr_t>  r( N );

    std::vector<uint8_t> query( qlen * N, uint8_t(0u) );

    std::vector<uint8_t> target( tlen * N, uint8_t(0u) );

    std::vector<int8_t>  mat( m * m, -4 );


    mat[0    ] = 1;

    mat[1*m+1] = 1;

    mat[2*m+2] = 1;

    mat[3*m+3] = 1;


    const uint32 block_size = 128;

    const uint32 n_blocks   = util::divide_ri( N, 128 );


    nvbio::Timer timer;

    timer.start();


    {

        for (uint32_t i = 0; i < N; ++i)

            qp[i] = ksw_qinit(1, qlen, &query[i*qlen], m, &mat[0] );


        for (uint32 j = 0; j < 10; ++j)

        {

            for (uint32_t i = 0; i < N; ++i)

                r[i] = ksw_u8( qp[i], tlen, &target[i*tlen], -1, -1, -2 );

        }

    }


    fprintf(stderr, "cpu ksw rate: %.1f M/s (%.1f GCUPS)\n", 1.0e-6f * float(10 * N)/timer.seconds(), 1.0e-9f * float(qlen * tlen) * (float(10 * N)/timer.seconds()));

}