3 #if defined( __CUDACC__ )
7 __device__ m128i mm_cached_load_si128 (m128i *p)
11 __device__ m128i mm_load_si128 (m128i *p)
15 __device__
void mm_store_si128 (m128i *p, m128i a)
19 __device__ m128i mm_set_epi8 (
char b15,
char b14,
char b13,
char b12,
char b11,
char b10,
char b9,
char b8,
char b7,
char b6,
char b5,
char b4,
char b3,
char b2,
char b1,
char b0)
22 unsigned int(
unsigned char(b0)+128) |
23 unsigned int(
unsigned char(b1)+128) << 8 |
24 unsigned int(
unsigned char(b2)+128) << 16 |
25 unsigned int(
unsigned char(b3)+128) << 24,
27 unsigned int(
unsigned char(b4)+128) |
28 unsigned int(
unsigned char(b5)+128) << 8 |
29 unsigned int(
unsigned char(b6)+128) << 16 |
30 unsigned int(
unsigned char(b7)+128) << 24,
32 unsigned int(
unsigned char(b8)+128) |
33 unsigned int(
unsigned char(b9)+128) << 8 |
34 unsigned int(
unsigned char(b10)+128) << 16 |
35 unsigned int(
unsigned char(b11)+128) << 24,
37 unsigned int(
unsigned char(b12)+128) |
38 unsigned int(
unsigned char(b13)+128) << 8 |
39 unsigned int(
unsigned char(b14)+128) << 16 |
40 unsigned int(
unsigned char(b15)+128) << 24 );
42 __device__ m128i mm_set1_epi32 (
int i)
44 return make_uint4( i, i, i, i );
46 __device__ m128i mm_set1_epi8 (
char b)
48 const unsigned int ub =
unsigned int(
unsigned char(b)+128);
49 const unsigned int uw = ub | ub << 8 | ub << 16 | ub << 24;
50 return make_uint4( uw, uw, uw, uw );
52 __device__ m128i mm_adds_epu8 (m128i a, m128i b)
58 vaddus4( a.w, b.w ) );
60 __device__ m128i mm_subs_epu8 (m128i a, m128i b)
66 vsubus4( a.w, b.w ) );
69 __device__ m128i mm_max_epu8 (m128i a, m128i b)
77 __device__ m128i mm_min_epu8 (m128i a, m128i b)
86 __device__ m128i mm_cmpeq_epi8 (m128i a, m128i b)
92 vcmpeq4( a.w, b.w ) );
95 __device__
int mm_movemask_epi8 (m128i a)
97 return ((a.x & (1u << 31)) >> 16) |
98 ((a.x & (1u << 23)) >> 9) |
99 ((a.x & (1u << 15)) >> 2) |
100 ((a.x & (1u << 7)) << 5) |
101 ((a.y & (1u << 31)) >> 20) |
102 ((a.y & (1u << 23)) >> 13) |
103 ((a.y & (1u << 15)) >> 6) |
104 ((a.y & (1u << 7)) << 1) |
105 ((a.z & (1u << 31)) >> 24) |
106 ((a.z & (1u << 23)) >> 17) |
107 ((a.z & (1u << 15)) >> 10) |
108 ((a.z & (1u << 7)) >> 3) |
109 ((a.w & (1u << 31)) >> 28) |
110 ((a.w & (1u << 23)) >> 21) |
111 ((a.w & (1u << 15)) >> 14) |
112 ((a.w & (1u << 7)) >> 7);
115 template <
unsigned int imm>
116 struct mm_slli_si128_dispatcher {};
119 struct mm_slli_si128_dispatcher<1>
121 __device__
static m128i dispatch(
const m128i a)
125 (a.y << 8) | (a.x >> 24),
126 (a.z << 8) | (a.y >> 24),
127 (a.w << 8) | (a.z >> 24) );
132 struct mm_slli_si128_dispatcher<2>
134 __device__
static m128i dispatch(
const m128i a)
138 (a.y << 16) | (a.x >> 16),
139 (a.z << 16) | (a.y >> 16),
140 (a.w << 16) | (a.z >> 16) );
145 struct mm_slli_si128_dispatcher<3>
147 __device__
static m128i dispatch(
const m128i a)
151 (a.y << 24) | (a.x >> 8),
152 (a.z << 24) | (a.y >> 8),
153 (a.w << 24) | (a.z >> 8) );
158 struct mm_slli_si128_dispatcher<4>
160 __device__
static m128i dispatch(
const m128i a)
171 struct mm_slli_si128_dispatcher<8>
173 __device__
static m128i dispatch(
const m128i a)
184 struct mm_slli_si128_dispatcher<12>
186 __device__
static m128i dispatch(
const m128i a)
196 #define mm_slli_si128( a, imm ) mm_slli_si128_dispatcher<imm>::dispatch( a )
198 template <
unsigned int imm>
199 struct mm_srli_si128_dispatcher {};
202 struct mm_srli_si128_dispatcher<1>
204 __device__
static m128i dispatch(
const m128i a)
207 (a.x >> 8) | (a.y & 255u),
208 (a.y >> 8) | (a.z & 255u),
209 (a.z >> 8) | (a.w & 255u),
215 struct mm_srli_si128_dispatcher<2>
217 __device__
static m128i dispatch(
const m128i a)
220 (a.x >> 16) | (a.y & 0x0000FFFFu),
221 (a.y >> 16) | (a.z & 0x0000FFFFu),
222 (a.z >> 16) | (a.w & 0x0000FFFFu),
228 struct mm_srli_si128_dispatcher<3>
230 __device__
static m128i dispatch(
const m128i a)
233 (a.x >> 24) | (a.y & 0x00FFFFFFu),
234 (a.y >> 24) | (a.z & 0x00FFFFFFu),
235 (a.z >> 24) | (a.w & 0x00FFFFFFu),
241 struct mm_srli_si128_dispatcher<4>
243 __device__
static m128i dispatch(
const m128i a)
254 struct mm_srli_si128_dispatcher<8>
256 __device__
static m128i dispatch(
const m128i a)
267 struct mm_srli_si128_dispatcher<12>
269 __device__
static m128i dispatch(
const m128i a)
279 #define mm_srli_si128( a, imm ) mm_srli_si128_dispatcher<imm>::dispatch( a )
281 template <
unsigned int imm>
struct mm_extract_epi16_dispatcher {};
282 template <>
struct mm_extract_epi16_dispatcher<0> { __device__
static int dispatch(
const m128i a) {
return a.w >> 16; } };
283 template <>
struct mm_extract_epi16_dispatcher<1> { __device__
static int dispatch(
const m128i a) {
return a.w & 0x0000FFFF; } };
284 template <>
struct mm_extract_epi16_dispatcher<2> { __device__
static int dispatch(
const m128i a) {
return a.z >> 16; } };
285 template <>
struct mm_extract_epi16_dispatcher<3> { __device__
static int dispatch(
const m128i a) {
return a.z & 0x0000FFFF; } };
286 template <>
struct mm_extract_epi16_dispatcher<4> { __device__
static int dispatch(
const m128i a) {
return a.y >> 16; } };
287 template <>
struct mm_extract_epi16_dispatcher<5> { __device__
static int dispatch(
const m128i a) {
return a.y & 0x0000FFFF; } };
288 template <>
struct mm_extract_epi16_dispatcher<6> { __device__
static int dispatch(
const m128i a) {
return a.x >> 16; } };
289 template <>
struct mm_extract_epi16_dispatcher<7> { __device__
static int dispatch(
const m128i a) {
return a.x & 0x0000FFFF; } };
291 #define mm_extract_epi16( a, imm ) mm_extract_epi16_dispatcher<imm>::dispatch( a )
293 __global__
void sse_test_kernel()
297 zero = mm_set1_epi32(0);
298 a = mm_set1_epi8(0xFF);
299 b = mm_cmpeq_epi8( zero, zero );
303 a.w != b.w) printf(
"error\n");
305 b = mm_set_epi8( 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 );
307 int h0 = mm_extract_epi16( b, 0 );