NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
sse_cuda.h
Go to the documentation of this file.
1 #include "simd_functions.h"
2 
3 #if defined( __CUDACC__ )
4 
5 typedef uint4 m128i;
6 
7 __device__ m128i mm_cached_load_si128 (m128i *p)
8 {
9  return __ldg(p);
10 }
11 __device__ m128i mm_load_si128 (m128i *p)
12 {
13  return *p;
14 }
15 __device__ void mm_store_si128 (m128i *p, m128i a)
16 {
17  *p = a;
18 }
19 __device__ m128i mm_set_epi8 (char b15, char b14, char b13, char b12,char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0)
20 {
21  return make_uint4(
22  unsigned int(unsigned char(b0)+128) |
23  unsigned int(unsigned char(b1)+128) << 8 |
24  unsigned int(unsigned char(b2)+128) << 16 |
25  unsigned int(unsigned char(b3)+128) << 24,
26 
27  unsigned int(unsigned char(b4)+128) |
28  unsigned int(unsigned char(b5)+128) << 8 |
29  unsigned int(unsigned char(b6)+128) << 16 |
30  unsigned int(unsigned char(b7)+128) << 24,
31 
32  unsigned int(unsigned char(b8)+128) |
33  unsigned int(unsigned char(b9)+128) << 8 |
34  unsigned int(unsigned char(b10)+128) << 16 |
35  unsigned int(unsigned char(b11)+128) << 24,
36 
37  unsigned int(unsigned char(b12)+128) |
38  unsigned int(unsigned char(b13)+128) << 8 |
39  unsigned int(unsigned char(b14)+128) << 16 |
40  unsigned int(unsigned char(b15)+128) << 24 );
41 }
42 __device__ m128i mm_set1_epi32 (int i)
43 {
44  return make_uint4( i, i, i, i );
45 }
46 __device__ m128i mm_set1_epi8 (char b)
47 {
48  const unsigned int ub = unsigned int(unsigned char(b)+128);
49  const unsigned int uw = ub | ub << 8 | ub << 16 | ub << 24;
50  return make_uint4( uw, uw, uw, uw );
51 }
52 __device__ m128i mm_adds_epu8 (m128i a, m128i b)
53 {
54  return make_uint4(
55  vaddus4( a.x, b.x ),
56  vaddus4( a.y, b.y ),
57  vaddus4( a.z, b.z ),
58  vaddus4( a.w, b.w ) );
59 }
60 __device__ m128i mm_subs_epu8 (m128i a, m128i b)
61 {
62  return make_uint4(
63  vsubus4( a.x, b.x ),
64  vsubus4( a.y, b.y ),
65  vsubus4( a.z, b.z ),
66  vsubus4( a.w, b.w ) );
67 }
68 
69 __device__ m128i mm_max_epu8 (m128i a, m128i b)
70 {
71  return make_uint4(
72  vmaxu4( a.x, b.x ),
73  vmaxu4( a.y, b.y ),
74  vmaxu4( a.z, b.z ),
75  vmaxu4( a.w, b.w ) );
76 }
77 __device__ m128i mm_min_epu8 (m128i a, m128i b)
78 {
79  return make_uint4(
80  vminu4( a.x, b.x ),
81  vminu4( a.y, b.y ),
82  vminu4( a.z, b.z ),
83  vminu4( a.w, b.w ) );
84 }
85 
86 __device__ m128i mm_cmpeq_epi8 (m128i a, m128i b)
87 {
88  return make_uint4(
89  vcmpeq4( a.x, b.x ),
90  vcmpeq4( a.y, b.y ),
91  vcmpeq4( a.z, b.z ),
92  vcmpeq4( a.w, b.w ) );
93 }
94 
95 __device__ int mm_movemask_epi8 (m128i a)
96 {
97  return ((a.x & (1u << 31)) >> 16) |
98  ((a.x & (1u << 23)) >> 9) |
99  ((a.x & (1u << 15)) >> 2) |
100  ((a.x & (1u << 7)) << 5) |
101  ((a.y & (1u << 31)) >> 20) |
102  ((a.y & (1u << 23)) >> 13) |
103  ((a.y & (1u << 15)) >> 6) |
104  ((a.y & (1u << 7)) << 1) |
105  ((a.z & (1u << 31)) >> 24) |
106  ((a.z & (1u << 23)) >> 17) |
107  ((a.z & (1u << 15)) >> 10) |
108  ((a.z & (1u << 7)) >> 3) |
109  ((a.w & (1u << 31)) >> 28) |
110  ((a.w & (1u << 23)) >> 21) |
111  ((a.w & (1u << 15)) >> 14) |
112  ((a.w & (1u << 7)) >> 7);
113 }
114 
115 template <unsigned int imm>
116 struct mm_slli_si128_dispatcher {};
117 
118 template <>
119 struct mm_slli_si128_dispatcher<1>
120 {
121  __device__ static m128i dispatch(const m128i a)
122  {
123  return make_uint4(
124  (a.x << 8),
125  (a.y << 8) | (a.x >> 24),
126  (a.z << 8) | (a.y >> 24),
127  (a.w << 8) | (a.z >> 24) );
128  }
129 };
130 
131 template <>
132 struct mm_slli_si128_dispatcher<2>
133 {
134  __device__ static m128i dispatch(const m128i a)
135  {
136  return make_uint4(
137  (a.x << 16),
138  (a.y << 16) | (a.x >> 16),
139  (a.z << 16) | (a.y >> 16),
140  (a.w << 16) | (a.z >> 16) );
141  }
142 };
143 
144 template <>
145 struct mm_slli_si128_dispatcher<3>
146 {
147  __device__ static m128i dispatch(const m128i a)
148  {
149  return make_uint4(
150  (a.x << 24),
151  (a.y << 24) | (a.x >> 8),
152  (a.z << 24) | (a.y >> 8),
153  (a.w << 24) | (a.z >> 8) );
154  }
155 };
156 
157 template <>
158 struct mm_slli_si128_dispatcher<4>
159 {
160  __device__ static m128i dispatch(const m128i a)
161  {
162  return make_uint4(
163  0,
164  a.x,
165  a.y,
166  a.z );
167  }
168 };
169 
170 template <>
171 struct mm_slli_si128_dispatcher<8>
172 {
173  __device__ static m128i dispatch(const m128i a)
174  {
175  return make_uint4(
176  0,
177  0,
178  a.x,
179  a.y );
180  }
181 };
182 
183 template <>
184 struct mm_slli_si128_dispatcher<12>
185 {
186  __device__ static m128i dispatch(const m128i a)
187  {
188  return make_uint4(
189  0,
190  0,
191  0,
192  a.x );
193  }
194 };
195 
196 #define mm_slli_si128( a, imm ) mm_slli_si128_dispatcher<imm>::dispatch( a )
197 
198 template <unsigned int imm>
199 struct mm_srli_si128_dispatcher {};
200 
201 template <>
202 struct mm_srli_si128_dispatcher<1>
203 {
204  __device__ static m128i dispatch(const m128i a)
205  {
206  return make_uint4(
207  (a.x >> 8) | (a.y & 255u),
208  (a.y >> 8) | (a.z & 255u),
209  (a.z >> 8) | (a.w & 255u),
210  (a.w >> 8) );
211  }
212 };
213 
214 template <>
215 struct mm_srli_si128_dispatcher<2>
216 {
217  __device__ static m128i dispatch(const m128i a)
218  {
219  return make_uint4(
220  (a.x >> 16) | (a.y & 0x0000FFFFu),
221  (a.y >> 16) | (a.z & 0x0000FFFFu),
222  (a.z >> 16) | (a.w & 0x0000FFFFu),
223  (a.w >> 16) );
224  }
225 };
226 
227 template <>
228 struct mm_srli_si128_dispatcher<3>
229 {
230  __device__ static m128i dispatch(const m128i a)
231  {
232  return make_uint4(
233  (a.x >> 24) | (a.y & 0x00FFFFFFu),
234  (a.y >> 24) | (a.z & 0x00FFFFFFu),
235  (a.z >> 24) | (a.w & 0x00FFFFFFu),
236  (a.w >> 24) );
237  }
238 };
239 
240 template <>
241 struct mm_srli_si128_dispatcher<4>
242 {
243  __device__ static m128i dispatch(const m128i a)
244  {
245  return make_uint4(
246  a.y,
247  a.z,
248  a.w,
249  0 );
250  }
251 };
252 
253 template <>
254 struct mm_srli_si128_dispatcher<8>
255 {
256  __device__ static m128i dispatch(const m128i a)
257  {
258  return make_uint4(
259  a.z,
260  a.w,
261  0,
262  0 );
263  }
264 };
265 
266 template <>
267 struct mm_srli_si128_dispatcher<12>
268 {
269  __device__ static m128i dispatch(const m128i a)
270  {
271  return make_uint4(
272  a.w,
273  0,
274  0,
275  0 );
276  }
277 };
278 
279 #define mm_srli_si128( a, imm ) mm_srli_si128_dispatcher<imm>::dispatch( a )
280 
281 template <unsigned int imm> struct mm_extract_epi16_dispatcher {};
282 template <> struct mm_extract_epi16_dispatcher<0> { __device__ static int dispatch(const m128i a) { return a.w >> 16; } };
283 template <> struct mm_extract_epi16_dispatcher<1> { __device__ static int dispatch(const m128i a) { return a.w & 0x0000FFFF; } };
284 template <> struct mm_extract_epi16_dispatcher<2> { __device__ static int dispatch(const m128i a) { return a.z >> 16; } };
285 template <> struct mm_extract_epi16_dispatcher<3> { __device__ static int dispatch(const m128i a) { return a.z & 0x0000FFFF; } };
286 template <> struct mm_extract_epi16_dispatcher<4> { __device__ static int dispatch(const m128i a) { return a.y >> 16; } };
287 template <> struct mm_extract_epi16_dispatcher<5> { __device__ static int dispatch(const m128i a) { return a.y & 0x0000FFFF; } };
288 template <> struct mm_extract_epi16_dispatcher<6> { __device__ static int dispatch(const m128i a) { return a.x >> 16; } };
289 template <> struct mm_extract_epi16_dispatcher<7> { __device__ static int dispatch(const m128i a) { return a.x & 0x0000FFFF; } };
290 
291 #define mm_extract_epi16( a, imm ) mm_extract_epi16_dispatcher<imm>::dispatch( a )
292 
293 __global__ void sse_test_kernel()
294 {
295  m128i zero, a, b;
296 
297  zero = mm_set1_epi32(0);
298  a = mm_set1_epi8(0xFF);
299  b = mm_cmpeq_epi8( zero, zero );
300  if (a.x != b.x ||
301  a.y != b.y ||
302  a.z != b.z ||
303  a.w != b.w) printf("error\n");
304 
305  b = mm_set_epi8( 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 );
306 
307  int h0 = mm_extract_epi16( b, 0 );
308 }
309 
310 #endif // __CUDACC__