38 #if defined(PLATFORM_X86)
43 #if defined(SSE_LOADS)
44 #include <emmintrin.h>
45 #include <smmintrin.h>
55 buckets.resize( n_buckets );
57 #pragma omp parallel for
72 const uint64 odd = ((c&2)? x : ~x) >> 1;
73 const uint64 even = ((c&1)? x : ~x);
74 const uint64 mask = odd & even & 0x5555555555555555U;
89 return (c == 0) ? r - i : r;
92 #if defined(SSE_LOADS)
100 #if defined(SSE_MATH)
101 const __m128i ones = _mm_set_epi64x( (
int64)0xFFFFFFFFFFFFFFFFull, (
int64)0xFFFFFFFFFFFFFFFFull );
102 const __m128i fives = _mm_set_epi64x( (
int64)0x5555555555555555ull, (
int64)0x5555555555555555ull );
103 const __m128i odd = _mm_srli_epi64( ((c&2)? x : _mm_subs_epu8( ones, x )), 1 );
104 const __m128i even = ((c&1)? x : _mm_subs_epu8( ones, x ));
105 const __m128i mask = _mm_and_si128( _mm_and_si128( odd, even ), fives );
106 return popc( (
uint64)_mm_extract_epi64( mask, 0 ) ) +
109 return popc_2bit<c>( (
uint64)_mm_extract_epi64(x,0) ) +
110 popc_2bit<c>( (
uint64)_mm_extract_epi64(x,1) );
120 const __m128i m = _mm_set_epi64x(
121 (
int64)(w == 0 ? u : 0xFFFFFFFFFFFFFFFFull),
122 (
int64)(w == 1 ? u : 0x0000000000000000ull) );
124 return _mm_and_si128( mask, m );
133 #if 0 && defined(SSE_MATH)
138 return (c == 0) ? r - i - w*32u : r;
141 return popc_2bit<c>( (
uint64)_mm_extract_epi64(mask,0), i );
143 return popc_2bit<c>( (
uint64)_mm_extract_epi64(mask,0) ) +
144 popc_2bit<c>( (
uint64)_mm_extract_epi64(mask,1), i );
152 #if !defined(SSE_LOADS)
156 for (
uint32 j = 0; j < n; ++j)
162 const __m128i* page_mm =
reinterpret_cast<const __m128i*
>( page );
169 for (
uint32 j = 0; j < n/2; ++j)
170 out += popc_2bit<0>( _mm_load_si128( page_mm + j ) );
172 out += popc_2bit<0>( _mm_load_si128( page_mm + n/2 ), n & 1,
mod );
177 for (
uint32 j = 0; j < n/2; ++j)
178 out += popc_2bit<1>( _mm_load_si128( page_mm + j ) );
180 out += popc_2bit<1>( _mm_load_si128( page_mm + n/2 ), n & 1,
mod );
185 for (
uint32 j = 0; j < n/2; ++j)
186 out += popc_2bit<2>( _mm_load_si128( page_mm + j ) );
188 out += popc_2bit<2>( _mm_load_si128( page_mm + n/2 ), n & 1,
mod );
193 for (
uint32 j = 0; j < n/2; ++j)
194 out += popc_2bit<3>( _mm_load_si128( page_mm + j ) );
196 out += popc_2bit<3>( _mm_load_si128( page_mm + n/2 ), n & 1,
mod );
204 m_pos.reserve( n_special );
207 m_id.reserve( n_special );
217 m_pos.resize( n_special );
218 m_id.resize( n_special );
252 #pragma omp parallel for
260 merge_by_key<host_tag>(
274 m_n_special += n_special;