simd_functions.h
1 /*
2  * nvbio
3  * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #if !defined (SIMD_FUNCTIONS_H__)
29 #define SIMD_FUNCTIONS_H__
30 
31 /*
32  This header file contains inline functions that implement intra-word SIMD
33  operations, that are hardware accelerated on sm_3x (Kepler) GPUs. Efficient
34  emulation code paths are provided for earlier architectures (sm_1x, sm_2x)
35  to make the code portable across all GPUs supported by CUDA. The following
 36  functions are currently implemented (an illustrative usage sketch follows the last definition below):
37 
38  vabsdiffu2(a,b) per-halfword unsigned absolute difference: |a - b|
39  vadd2(a,b) per-halfword (un)signed addition, with wrap-around: a + b
40  vaddss2(a,b) per-halfword addition with signed saturation: sat.s16 (a + b)
41  vaddus2(a,b) per-halfword addition with unsigned saturation: sat.u16 (a+b)
42  vavgs2(a,b) per-halfword signed rounded average: (a+b+((a+b)>=0)) >> 1
43  vavgu2(a,b) per-halfword unsigned rounded average: (a + b + 1) / 2
44  vcmpeq2(a,b) per-halfword (un)signed comparison: a == b ? 0xffff : 0
45  vcmpgeu2(a,b) per-halfword unsigned comparison: a >= b ? 0xffff : 0
46  vcmpgtu2(a,b) per-halfword unsigned comparison: a > b ? 0xffff : 0
47  vcmpleu2(a,b) per-halfword unsigned comparison: a <= b ? 0xffff : 0
48  vcmpltu2(a,b) per-halfword unsigned comparison: a < b ? 0xffff : 0
49  vcmpne2(a,b) per-halfword (un)signed comparison: a != b ? 0xffff : 0
50  vhaddu2(a,b) per-halfword unsigned average: (a + b) / 2
51  vmaxu2(a,b) per-halfword unsigned maximum: max(a, b)
52  vminu2(a,b) per-halfword unsigned minimum: min(a, b)
53  vseteq2(a,b) per-halfword (un)signed comparison: a == b ? 1 : 0
54  vsetgeu2(a,b) per-halfword unsigned comparison: a >= b ? 1 : 0
55  vsetgtu2(a,b) per-halfword unsigned comparison: a > b ? 1 : 0
56  vsetleu2(a,b) per-halfword unsigned comparison: a <= b ? 1 : 0
57  vsetltu2(a,b) per-halfword unsigned comparison: a < b ? 1 : 0
58  vsetne2(a,b) per-halfword (un)signed comparison: a != b ? 1 : 0
59  vsub2(a,b) per-halfword (un)signed subtraction, with wrap-around: a - b
60 
61  vabsdiffu4(a,b) per-byte unsigned absolute difference: |a - b|
62  vadd4(a,b) per-byte (un)signed addition, with wrap-around: a + b
63  vaddss4(a,b) per-byte addition with signed saturation: sat.s8 (a + b)
64  vaddus4(a,b) per-byte addition with unsigned saturation: sat.u8 (a + b)
65  vavgs4(a,b) per-byte signed rounded average: (a + b + ((a+b) >= 0)) >> 1
66  vavgu4(a,b) per-byte unsigned rounded average: (a + b + 1) / 2
67  vcmpeq4(a,b) per-byte (un)signed comparison: a == b ? 0xff : 0
68  vcmpgeu4(a,b) per-byte unsigned comparison: a >= b ? 0xff : 0
69  vcmpgtu4(a,b) per-byte unsigned comparison: a > b ? 0xff : 0
70  vcmpleu4(a,b) per-byte unsigned comparison: a <= b ? 0xff : 0
71  vcmpltu4(a,b) per-byte unsigned comparison: a < b ? 0xff : 0
 72  vcmpne4(a,b) per-byte (un)signed comparison: a != b ? 0xff : 0
73  vhaddu4(a,b) per-byte unsigned average: (a + b) / 2
74  vmaxu4(a,b) per-byte unsigned maximum: max(a, b)
75  vminu4(a,b) per-byte unsigned minimum: min(a, b)
76  vseteq4(a,b) per-byte (un)signed comparison: a == b ? 1 : 0
77  vsetgeu4(a,b) per-byte unsigned comparison: a >= b ? 1 : 0
78  vsetgtu4(a,b) per-byte unsigned comparison: a > b ? 1 : 0
79  vsetleu4(a,b) per-byte unsigned comparison: a <= b ? 1 : 0
80  vsetltu4(a,b) per-byte unsigned comparison: a < b ? 1 : 0
 81  vsetne4(a,b) per-byte (un)signed comparison: a != b ? 1 : 0
82  vsub4(a,b) per-byte (un)signed subtraction, with wrap-around: a - b
83 */
84 
85 static __device__ __forceinline__ unsigned int vadd2(unsigned int a, unsigned int b)
86 {
87  unsigned int s, t;
88 #if __CUDA_ARCH__ >= 300
89  s = 0;
90  asm ("vadd2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(t) : "r"(a), "r"(b), "r"(s));
91 #else /* __CUDA_ARCH__ >= 300 */
92  s = a ^ b; // sum bits
93  t = a + b; // actual sum
94  s = s ^ t; // determine carry-ins for each bit position
 95  s = s & 0x00010000; // carry-in to high halfword (= carry-out from low halfword)
 96  t = t - s; // subtract out carry-out from low halfword
97 #endif /* __CUDA_ARCH__ >= 300 */
98  return t; // halfword-wise sum
99 }
100 
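/*
  Annotation (not part of the original header): the emulation above performs a single 32-bit
  add and then subtracts the carry that crossed from bit 15 into bit 16, so the two halfword
  lanes behave independently. A plain per-lane reference -- the name vadd2_reference is purely
  illustrative -- shows the intended semantics, e.g. vadd2(0x00017fff, 0x00018001) == 0x00020000.
*/
static __device__ __forceinline__ unsigned int vadd2_reference(unsigned int a, unsigned int b)
{
    unsigned int lo = ((a & 0x0000ffffU) + (b & 0x0000ffffU)) & 0x0000ffffU; // low lane wraps at 16 bits
    unsigned int hi = ((a >> 16) + (b >> 16)) << 16;                         // high lane wraps via the shift
    return hi | lo;
}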
101 static __device__ __forceinline__ unsigned int vaddss2 (unsigned int a, unsigned int b)
102 {
103  unsigned int r;
104 #if __CUDA_ARCH__ >= 300
105  unsigned int c = 0;
106  asm ("vadd2.s32.s32.s32.sat %0,%1,%2,%3;" : "=r"(r):"r"(a),"r"(b),"r"(c));
107 #else /* __CUDA_ARCH__ >= 300 */
108  int ahi, alo, blo, bhi, rhi, rlo;
109  ahi = (int)((a & 0xffff0000U));
110  bhi = (int)((b & 0xffff0000U));
111 #if __CUDA_ARCH__ < 200
112  alo = (int)(a << 16);
113  blo = (int)(b << 16);
114 #elif __CUDA_ARCH__ < 350
 115  // work-around: build the left shift by 16 with prmt (plain left shifts would be preferable, at least on sm_2x)
116  asm ("prmt.b32 %0,%1,0,0x1044;" : "=r"(alo) : "r"(a));
117  asm ("prmt.b32 %0,%1,0,0x1044;" : "=r"(blo) : "r"(b));
118 #else
119  asm ("shf.l.clamp.b32 %0,0,%1,16;" : "=r"(alo) : "r"(a));
120  asm ("shf.l.clamp.b32 %0,0,%1,16;" : "=r"(blo) : "r"(b));
121 #endif
122  asm ("add.sat.s32 %0,%1,%2;" : "=r"(rlo) : "r"(alo), "r"(blo));
123  asm ("add.sat.s32 %0,%1,%2;" : "=r"(rhi) : "r"(ahi), "r"(bhi));
124 #if __CUDA_ARCH__ < 200
125  r = ((unsigned int)rhi & 0xffff0000U) | ((unsigned int)rlo >> 16);
126 #else
127  asm ("prmt.b32 %0,%1,%2,0x7632;" : "=r"(r) : "r"(rlo), "r"(rhi));
128 #endif /* __CUDA_ARCH__ < 200 */
129 #endif /* __CUDA_ARCH__ >= 300 */
130  return r;
131 }
132 
133 static __device__ __forceinline__ unsigned int vaddus2 (unsigned int a, unsigned int b)
134 {
135  unsigned int r;
136 #if __CUDA_ARCH__ >= 300
137  unsigned int c = 0;
138  asm ("vadd2.u32.u32.u32.sat %0,%1,%2,%3;" : "=r"(r):"r"(a),"r"(b),"r"(c));
139 #else /* __CUDA_ARCH__ >= 300 */
140  int alo, blo, rlo, ahi, bhi, rhi;
141  asm ("{ \n\t"
142  "and.b32 %0, %4, 0xffff; \n\t"
143  "and.b32 %1, %5, 0xffff; \n\t"
144 #if __CUDA_ARCH__ < 350
145  "shr.u32 %2, %4, 16; \n\t"
146  "shr.u32 %3, %5, 16; \n\t"
147 #else /* __CUDA_ARCH__ < 350 */
148  "shf.r.clamp.b32 %2, %4, 0, 16;\n\t"
149  "shf.r.clamp.b32 %3, %5, 0, 16;\n\t"
150 #endif /* __CUDA_ARCH__ < 350 */
151  "}"
152  : "=r"(alo), "=r"(blo), "=r"(ahi), "=r"(bhi)
153  : "r"(a), "r"(b));
154  rlo = ::min (alo + blo, 65535);
155  rhi = ::min (ahi + bhi, 65535);
156  r = (rhi << 16) + rlo;
157 #endif /* __CUDA_ARCH__ >= 300 */
158  return r;
159 }
160 
161 static __device__ __forceinline__ unsigned int vavgs2(unsigned int a, unsigned int b)
162 {
163  unsigned int r;
164 #if __CUDA_ARCH__ >= 300
165  unsigned int c = 0;
166  asm ("vavrg2.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
167 #else /* __CUDA_ARCH__ >= 300 */
168  // avgs (a + b) = ((a + b) < 0) ? ((a + b) >> 1) : ((a + b + 1) >> 1). The
169  // two expressions can be re-written as follows to avoid needing additional
170  // intermediate bits: ((a + b) >> 1) = (a >> 1) + (b >> 1) + ((a & b) & 1),
171  // ((a + b + 1) >> 1) = (a >> 1) + (b >> 1) + ((a | b) & 1). The difference
172  // between the two is ((a ^ b) & 1). Note that if (a + b) < 0, then also
173  // ((a + b) >> 1) < 0, since right shift rounds to negative infinity. This
174  // means we can compute ((a + b) >> 1) then conditionally add ((a ^ b) & 1)
175  // depending on the sign bit of the shifted sum. By handling the msb sum
176  // bit of the result separately, we avoid carry-out during summation and
177  // also can use (potentially faster) logical right shifts.
178  asm ("{ \n\t"
179  ".reg .u32 a,b,c,r,s,t,u,v;\n\t"
180  "mov.b32 a,%1; \n\t"
181  "mov.b32 b,%2; \n\t"
182  "and.b32 u,a,0xfffefffe;\n\t" // prevent shift crossing chunk boundary
183  "and.b32 v,b,0xfffefffe;\n\t" // prevent shift crossing chunk boundary
184  "xor.b32 s,a,b; \n\t" // a ^ b
185  "and.b32 t,a,b; \n\t" // a & b
186  "shr.u32 u,u,1; \n\t" // a >> 1
187  "shr.u32 v,v,1; \n\t" // b >> 1
188  "and.b32 c,s,0x00010001;\n\t" // (a ^ b) & 1
189  "and.b32 s,s,0x80008000;\n\t" // extract msb (a ^ b)
190  "and.b32 t,t,0x00010001;\n\t" // (a & b) & 1
191  "add.u32 r,u,v; \n\t" // (a>>1)+(b>>1)
192  "add.u32 r,r,t; \n\t" // (a>>1)+(b>>1)+(a&b&1); rec. msb cy-in
 193  "xor.b32 r,r,s; \n\t" // compute msb sum bit: a ^ b ^ cy-in
194  "shr.u32 t,r,15; \n\t" // sign ((a + b) >> 1)
195  "not.b32 t,t; \n\t" // ~sign ((a + b) >> 1)
196  "and.b32 t,t,c; \n\t" // ((a ^ b) & 1) & ~sign ((a + b) >> 1)
197  "add.u32 r,r,t; \n\t" // conditionally add ((a ^ b) & 1)
198  "mov.b32 %0,r; \n\t"
199  "}"
200  : "=r"(r) : "r"(a), "r"(b));
201 #endif /* __CUDA_ARCH__ >= 300 */
202  return r; // halfword-wise signed average
203 }
204 
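/*
  Annotation (illustrative only; avgs16_reference is not part of this header): the scalar
  operation that the emulation above reproduces in each signed 16-bit lane. The exact sum
  fits in 32 bits, so the rounding rule from the derivation comment can be applied directly.
*/
static __device__ __forceinline__ int avgs16_reference(short x, short y)
{
    int s = (int)x + (int)y;                     // exact sum, no overflow possible in 32 bits
    return (s < 0) ? (s >> 1) : ((s + 1) >> 1);  // signed rounded average: (x + y + ((x+y) >= 0)) >> 1
}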
205 static __device__ __forceinline__ unsigned int vavgu2(unsigned int a, unsigned int b)
206 {
207  unsigned int r, c;
208 #if __CUDA_ARCH__ >= 300
209  c = 0;
210  asm ("vavrg2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
211 #else /* __CUDA_ARCH__ >= 300 */
212  // HAKMEM #23: a + b = 2 * (a | b) - (a ^ b) ==>
213  // (a + b + 1) / 2 = (a | b) - ((a ^ b) >> 1)
214  c = a ^ b;
215  r = a | b;
216  c = c & 0xfffefffe; // ensure shift doesn't cross half-word boundaries
217  c = c >> 1;
218  r = r - c;
219 #endif /* __CUDA_ARCH__ >= 300 */
220  return r; // halfword-wise unsigned average
221 }
222 
223 static __device__ __forceinline__ unsigned int vhaddu2(unsigned int a, unsigned int b)
224 {
225  // HAKMEM #23: a + b = 2 * (a & b) + (a ^ b) ==>
226  // (a + b) / 2 = (a & b) + ((a ^ b) >> 1)
227  unsigned int r, s;
228  s = a ^ b;
229  r = a & b;
230  s = s & 0xfffefffe; // ensure shift doesn't cross halfword boundaries
231  s = s >> 1;
232  r = r + s;
233  return r; // halfword-wise unsigned average [rounded down]
234 }
235 
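/*
  Annotation (sketch, not part of the original header): why the HAKMEM #23 rewrites used by
  vavgu2() and vhaddu2() work. For any words,
      a + b == (a ^ b) + 2 * (a & b)      (sum bits plus carries)
      a + b == 2 * (a | b) - (a ^ b)      (since (a | b) == (a & b) + (a ^ b))
  so  (a + b) / 2     == (a & b) + ((a ^ b) >> 1)   and
      (a + b + 1) / 2 == (a | b) - ((a ^ b) >> 1)
  without ever needing a 17-bit intermediate. Worked 16-bit lane, a = 0x0003, b = 0x0006:
      a^b = 0x0005, a&b = 0x0002, a|b = 0x0007
      vhaddu2 lane: 0x0002 + (0x0005 >> 1) = 0x0004    // (3 + 6) / 2, rounded down
      vavgu2  lane: 0x0007 - (0x0005 >> 1) = 0x0005    // (3 + 6 + 1) / 2, rounded up
  The 0xfffefffe mask merely keeps the >> 1 from dragging a bit of the high halfword into the
  low halfword. A trivial self-check of the identities (hypothetical helper):
*/
static __device__ __forceinline__ bool hakmem23_holds(unsigned int a, unsigned int b)
{
    return ((a + b) == ((a ^ b) + 2u * (a & b))) &&
           ((a + b) == (2u * (a | b) - (a ^ b)));
}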
236 static __device__ __forceinline__ unsigned int vcmpeq2(unsigned int a, unsigned int b)
237 {
238  unsigned int r, c;
239 #if __CUDA_ARCH__ >= 300
240  c = 0;
241  asm ("vset2.u32.u32.eq %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
242  c = r << 16; // convert bool
243  r = c - r; // into mask
244 #else /* __CUDA_ARCH__ >= 300 */
245  // inspired by Alan Mycroft's null-byte detection algorithm:
246  // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
247  r = a ^ b; // 0x0000 if a == b
248  c = r | 0x80008000; // set msbs, to catch carry out
249  r = r ^ c; // extract msbs, msb = 1 if r < 0x8000
250  c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
251  c = r & ~c; // msb = 1, if r was 0x0000
252  r = c >> 15; // convert
253  r = c - r; // msbs to
254  r = c | r; // mask
255 #endif /* __CUDA_ARCH__ >= 300 */
256  return r; // halfword-wise unsigned eq comparison, mask result
257 }
258 
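/*
  Annotation (sketch, not part of the original header): how the Mycroft-style test above
  isolates lanes where a == b. With r = a ^ b, every lane of (r | 0x8000) is at least 0x8000,
  so the per-word subtraction of 0x00010001 can never borrow across the lane boundary; the
  msb of a lane drops to 0 only when that lane was exactly 0x8000, i.e. when r was 0x0000 or
  0x8000. ANDing with the msbs of ~r then keeps the msb only for r == 0x0000. Per-lane traces:
      equal lane   (r = 0x0000): c = 0x8000, c - 1 = 0x7fff, (r ^ c) & ~(c - 1) has msb 1 -> mask 0xffff
      unequal lane (r = 0x0001): c = 0x8001, c - 1 = 0x8000, (r ^ c) & ~(c - 1) has msb 0 -> mask 0x0000
      unequal lane (r = 0x8000): r ^ c = 0x0000, so the AND is 0                          -> mask 0x0000
  e.g. vcmpeq2(0x12345678, 0x12345679) == 0xffff0000. The same trick drives vseteq2(),
  vcmpne2(), vsetne2() and their byte-wise counterparts.
*/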
259 static __device__ __forceinline__ unsigned int vcmpgeu2(unsigned int a, unsigned int b)
260 {
261  unsigned int r, c;
262 #if __CUDA_ARCH__ >= 300
263  c = 0;
264  asm ("vset2.u32.u32.ge %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
265  c = r << 16; // convert bool
266  r = c - r; // into mask
267 #else /* __CUDA_ARCH__ >= 300 */
268  asm ("not.b32 %0,%0;" : "+r"(b));
269  c = vavgu2 (a, b); // (a + ~b + 1) / 2 = (a - b) / 2
270  asm ("and.b32 %0,%0,0x80008000;" : "+r"(c)); // msb = carry-outs
271  asm ("shr.u32 %0,%1,15;" : "=r"(r) : "r"(c)); // build mask
272  asm ("sub.u32 %0,%1,%0;" : "+r"(r) : "r"(c)); // from
273  asm ("or.b32 %0,%1,%0;" : "+r"(r) : "r"(c)); // msbs
274 #endif /* __CUDA_ARCH__ >= 300 */
275  return r; // halfword-wise unsigned gt-eq comparison, mask result
276 }
277 
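/*
  Annotation (sketch, not part of the original header): the idea behind this family of
  unsigned comparisons. In a 16-bit lane,
      a >= b  <=>  a + ~b + 1 >= 0x10000   (carry-out of the two's-complement subtraction)
      a >  b  <=>  a + ~b     >= 0x10000
  A full 16-bit lane addition would lose that carry, but vavgu2(a, ~b) = (a + ~b + 1) >> 1 and
  vhaddu2(a, ~b) = (a + ~b) >> 1 are computed without overflow, so the would-be carry-out
  simply lands in the lane's msb, where it is masked with 0x80008000 and spread into a mask
  (or shifted down to a bool by the vset* variants). Complementing the other operand instead
  yields the le/lt forms, and the byte-wise versions use vavgu4()/vhaddu4() with an 0x80808080
  mask in exactly the same way. e.g. vcmpgeu2(0x00050003, 0x00050004) == 0xffff0000.
*/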
278 static __device__ __forceinline__ unsigned int vcmpgtu2(unsigned int a, unsigned int b)
279 {
280  unsigned int r, c;
281 #if __CUDA_ARCH__ >= 300
282  c = 0;
283  asm ("vset2.u32.u32.gt %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
284  c = r << 16; // convert bool
285  r = c - r; // into mask
286 #else /* __CUDA_ARCH__ >= 300 */
287  asm ("not.b32 %0,%0;" : "+r"(b));
288  c = vhaddu2 (a, b); // (a + ~b) / 2 = (a - b) / 2 [rounded down]
289  asm ("and.b32 %0,%0,0x80008000;" : "+r"(c)); // msb = carry-outs
290  asm ("shr.u32 %0,%1,15;" : "=r"(r) : "r"(c)); // build mask
291  asm ("sub.u32 %0,%1,%0;" : "+r"(r) : "r"(c)); // from
292  asm ("or.b32 %0,%1,%0;" : "+r"(r) : "r"(c)); // msbs
293 #endif /* __CUDA_ARCH__ >= 300 */
294  return r; // halfword-wise unsigned gt comparison, mask result
295 }
296 
297 static __device__ __forceinline__ unsigned int vcmpleu2(unsigned int a, unsigned int b)
298 {
299  unsigned int r, c;
300 #if __CUDA_ARCH__ >= 300
301  c = 0;
302  asm ("vset2.u32.u32.le %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
303  c = r << 16; // convert bool
304  r = c - r; // into mask
305 #else /* __CUDA_ARCH__ >= 300 */
306  asm ("not.b32 %0,%0;" : "+r"(a));
307  c = vavgu2 (a, b); // (b + ~a + 1) / 2 = (b - a) / 2
308  asm ("and.b32 %0,%0,0x80008000;" : "+r"(c)); // msb = carry-outs
309  asm ("shr.u32 %0,%1,15;" : "=r"(r) : "r"(c)); // build mask
310  asm ("sub.u32 %0,%1,%0;" : "+r"(r) : "r"(c)); // from
311  asm ("or.b32 %0,%1,%0;" : "+r"(r) : "r"(c)); // msbs
312 #endif /* __CUDA_ARCH__ >= 300 */
313  return r; // halfword-wise unsigned lt-eq comparison, mask result
314 }
315 
316 static __device__ __forceinline__ unsigned int vcmpltu2(unsigned int a, unsigned int b)
317 {
318  unsigned int r, c;
319 #if __CUDA_ARCH__ >= 300
320  c = 0;
321  asm ("vset2.u32.u32.lt %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
322  c = r << 16; // convert bool
323  r = c - r; // into mask
324 #else /* __CUDA_ARCH__ >= 300 */
325  asm ("not.b32 %0,%0;" : "+r"(a));
326  c = vhaddu2 (a, b); // (b + ~a) / 2 = (b - a) / 2 [rounded down]
327  asm ("and.b32 %0,%0,0x80008000;" : "+r"(c)); // msb = carry-outs
328  asm ("shr.u32 %0,%1,15;" : "=r"(r) : "r"(c)); // build mask
329  asm ("sub.u32 %0,%1,%0;" : "+r"(r) : "r"(c)); // from
330  asm ("or.b32 %0,%1,%0;" : "+r"(r) : "r"(c)); // msbs
331 #endif /* __CUDA_ARCH__ >= 300 */
332  return r; // halfword-wise unsigned lt comparison, mask result
333 }
334 
335 static __device__ __forceinline__ unsigned int vcmpne2(unsigned int a, unsigned int b)
336 {
337  unsigned int r, c;
338 #if __CUDA_ARCH__ >= 300
339  c = 0;
340  asm ("vset2.u32.u32.ne %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
341  c = r << 16; // convert bool
342  r = c - r; // into mask
343 #else /* __CUDA_ARCH__ >= 300 */
344  // inspired by Alan Mycroft's null-byte detection algorithm:
345  // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
346  r = a ^ b; // 0x0000 if a == b
347  c = r | 0x80008000; // set msbs, to catch carry out
348  c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
349  c = r | c; // msb = 1, if r was not 0x0000
350  asm ("and.b32 %0,%0,0x80008000;" : "+r"(c)); // extract msbs
351  asm ("shr.u32 %0,%1,15;" : "=r"(r) : "r"(c)); // build mask
352  asm ("sub.u32 %0,%1,%0;" : "+r"(r) : "r"(c)); // from
353  asm ("or.b32 %0,%1,%0;" : "+r"(r) : "r"(c)); // msbs
354 #endif /* __CUDA_ARCH__ >= 300 */
355  return r; // halfword-wise unsigned ne comparison, mask result
356 }
357 
358 static __device__ __forceinline__ unsigned int vabsdiffu2(unsigned int a, unsigned int b)
359 {
360  unsigned int r, s;
361 #if __CUDA_ARCH__ >= 300
362  s = 0;
363  asm ("vabsdiff2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(a),"r"(b),"r"(s));
364 #else /* __CUDA_ARCH__ >= 300 */
365  unsigned int t, u, v;
366  s = a & 0x0000ffff; // extract low halfword
367  r = b & 0x0000ffff; // extract low halfword
368  u = ::max (r, s); // maximum of low halfwords
369  v = ::min (r, s); // minimum of low halfwords
370  s = a & 0xffff0000; // extract high halfword
371  r = b & 0xffff0000; // extract high halfword
372  t = ::max (r, s); // maximum of high halfwords
373  s = ::min (r, s); // minimum of high halfwords
374  r = u | t; // maximum of both halfwords
375  s = v | s; // minimum of both halfwords
376  r = r - s; // |a - b| = max(a,b) - min(a,b);
377 #endif /* __CUDA_ARCH__ >= 300 */
 378  return r; // halfword-wise unsigned absolute difference
379 }
380 
381 static __device__ __forceinline__ unsigned int vmaxu2(unsigned int a, unsigned int b)
382 {
383  unsigned int r, s;
384 #if __CUDA_ARCH__ >= 300
385  s = 0;
386  asm ("vmax2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(s));
387 #else /* __CUDA_ARCH__ >= 300 */
388  unsigned int t, u;
389  r = a & 0x0000ffff; // extract low halfword
390  s = b & 0x0000ffff; // extract low halfword
391  t = ::max (r, s); // maximum of low halfwords
392  r = a & 0xffff0000; // extract high halfword
393  s = b & 0xffff0000; // extract high halfword
394  u = ::max (r, s); // maximum of high halfwords
395  r = t | u; // combine halfword maximums
396 #endif /* __CUDA_ARCH__ >= 300 */
397  return r; // halfword-wise unsigned maximum
398 }
399 
400 static __device__ __forceinline__ unsigned int vminu2(unsigned int a, unsigned int b)
401 {
402  unsigned int r, s;
403 #if __CUDA_ARCH__ >= 300
404  s = 0;
405  asm ("vmin2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(s));
406 #else /* __CUDA_ARCH__ >= 300 */
407  unsigned int t, u;
408  r = a & 0x0000ffff; // extract low halfword
409  s = b & 0x0000ffff; // extract low halfword
410  t = ::min (r, s); // minimum of low halfwords
411  r = a & 0xffff0000; // extract high halfword
412  s = b & 0xffff0000; // extract high halfword
413  u = ::min (r, s); // minimum of high halfwords
414  r = t | u; // combine halfword minimums
415 #endif /* __CUDA_ARCH__ >= 300 */
416  return r; // halfword-wise unsigned minimum
417 }
418 
419 static __device__ __forceinline__ unsigned int vseteq2(unsigned int a, unsigned int b)
420 {
421  unsigned int r, c;
422 #if __CUDA_ARCH__ >= 300
423  c = 0;
424  asm ("vset2.u32.u32.eq %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
425 #else /* __CUDA_ARCH__ >= 300 */
426  // inspired by Alan Mycroft's null-byte detection algorithm:
427  // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
428  r = a ^ b; // 0x0000 if a == b
429  c = r | 0x80008000; // set msbs, to catch carry out
430  r = r ^ c; // extract msbs, msb = 1 if r < 0x8000
431  c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
432  c = r & ~c; // msb = 1, if r was 0x0000
433  r = c >> 15; // convert to bool
434 #endif /* __CUDA_ARCH__ >= 300 */
435  return r; // halfword-wise unsigned eq comparison, bool result
436 }
437 
438 static __device__ __forceinline__ unsigned int vsetgeu2(unsigned int a, unsigned int b)
439 {
440  unsigned int r, c;
441 #if __CUDA_ARCH__ >= 300
442  c = 0;
443  asm ("vset2.u32.u32.ge %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
444 #else /* __CUDA_ARCH__ >= 300 */
445  asm ("not.b32 %0,%0;" : "+r"(b));
446  c = vavgu2 (a, b); // (a + ~b + 1) / 2 = (a - b) / 2
447  c = c & 0x80008000; // msb = carry-outs
448  r = c >> 15; // convert to bool
449 #endif /* __CUDA_ARCH__ >= 300 */
450  return r; // halfword-wise unsigned gt-eq comparison, bool result
451 }
452 
453 static __device__ __forceinline__ unsigned int vsetgtu2(unsigned int a, unsigned int b)
454 {
455  unsigned int r, c;
456 #if __CUDA_ARCH__ >= 300
457  c = 0;
458  asm ("vset2.u32.u32.gt %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
459 #else /* __CUDA_ARCH__ >= 300 */
460  asm ("not.b32 %0,%0;" : "+r"(b));
461  c = vhaddu2 (a, b); // (a + ~b) / 2 = (a - b) / 2 [rounded down]
462  c = c & 0x80008000; // msbs = carry-outs
463  r = c >> 15; // convert to bool
464 #endif /* __CUDA_ARCH__ >= 300 */
465  return r; // halfword-wise unsigned gt comparison, bool result
466 }
467 
468 static __device__ __forceinline__ unsigned int vsetleu2(unsigned int a, unsigned int b)
469 {
470  unsigned int r, c;
471 #if __CUDA_ARCH__ >= 300
472  c = 0;
473  asm ("vset2.u32.u32.le %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
474 #else /* __CUDA_ARCH__ >= 300 */
475  asm ("not.b32 %0,%0;" : "+r"(a));
476  c = vavgu2 (a, b); // (b + ~a + 1) / 2 = (b - a) / 2
477  c = c & 0x80008000; // msb = carry-outs
478  r = c >> 15; // convert to bool
479 #endif /* __CUDA_ARCH__ >= 300 */
480  return r; // halfword-wise unsigned lt-eq comparison, bool result
481 }
482 
483 static __device__ __forceinline__ unsigned int vsetltu2(unsigned int a, unsigned int b)
484 {
485  unsigned int r, c;
486 #if __CUDA_ARCH__ >= 300
487  c = 0;
488  asm ("vset2.u32.u32.lt %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
489 #else /* __CUDA_ARCH__ >= 300 */
490  asm ("not.b32 %0,%0;" : "+r"(a));
491  c = vhaddu2 (a, b); // (b + ~a) / 2 = (b - a) / 2 [rounded down]
492  c = c & 0x80008000; // msb = carry-outs
493  r = c >> 15; // convert to bool
494 #endif /* __CUDA_ARCH__ >= 300 */
495  return r; // halfword-wise unsigned lt comparison, bool result
496 }
497 
498 static __device__ __forceinline__ unsigned int vsetne2(unsigned int a, unsigned int b)
499 {
500  unsigned int r, c;
501 #if __CUDA_ARCH__ >= 300
502  c = 0;
503  asm ("vset2.u32.u32.ne %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
504 #else /* __CUDA_ARCH__ >= 300 */
505  // inspired by Alan Mycroft's null-byte detection algorithm:
506  // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
507  r = a ^ b; // 0x0000 if a == b
508  c = r | 0x80008000; // set msbs, to catch carry out
509  c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
510  c = r | c; // msb = 1, if r was not 0x0000
511  c = c & 0x80008000; // extract msbs
512  r = c >> 15; // convert to bool
513 #endif /* __CUDA_ARCH__ >= 300 */
514  return r; // halfword-wise unsigned ne comparison, bool result
515 }
516 
517 static __device__ __forceinline__ unsigned int vsub2(unsigned int a, unsigned int b)
518 {
519  unsigned int s, t;
520 #if __CUDA_ARCH__ >= 300
521  s = 0;
522  asm ("vsub2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(t) : "r"(a), "r"(b), "r"(s));
523 #else /* __CUDA_ARCH__ >= 300 */
 524  s = a ^ b; // per-bit difference without borrows
 525  t = a - b; // actual difference
 526  s = s ^ t; // determine borrow-ins for each bit position
 527  s = s & 0x00010000; // borrow-in to high halfword (= borrow-out from low halfword)
 528  t = t + s; // compensate for the borrow taken from the high halfword
529 #endif /* __CUDA_ARCH__ >= 300 */
530  return t; // halfword-wise difference
531 }
532 
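/*
  Annotation (illustrative only; vsub2_reference is not part of this header): vsub2() mirrors
  vadd2(): the 32-bit subtraction is performed in one go, (a ^ b) ^ t recovers the per-bit
  borrow-ins, and the borrow that the low halfword took from the high halfword (bit 16) is
  added back, e.g. vsub2(0x00010000, 0x00000001) == 0x0001ffff.
*/
static __device__ __forceinline__ unsigned int vsub2_reference(unsigned int a, unsigned int b)
{
    unsigned int lo = ((a & 0x0000ffffU) - (b & 0x0000ffffU)) & 0x0000ffffU; // low lane wraps at 16 bits
    unsigned int hi = ((a >> 16) - (b >> 16)) << 16;                         // high lane wraps via the shift
    return hi | lo;
}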
533 static __device__ __forceinline__ unsigned int vadd4(unsigned int a, unsigned int b)
534 {
535 #if __CUDA_ARCH__ >= 300
536  unsigned int r, c = 0;
537  asm ("vadd4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
538 #else /* __CUDA_ARCH__ >= 300 */
539  unsigned int r, s, t;
540  s = a ^ b; // sum bits
541  r = a & 0x7f7f7f7f; // clear msbs
542  t = b & 0x7f7f7f7f; // clear msbs
543  s = s & 0x80808080; // msb sum bits
544  r = r + t; // add without msbs, record carry-out in msbs
545  r = r ^ s; // sum of msb sum and carry-in bits, w/o carry-out
546 #endif /* __CUDA_ARCH__ >= 300 */
547  return r; // byte-wise sum
548 }
549 
550 static __device__ __forceinline__ unsigned int vaddus4 (unsigned int a, unsigned int b)
551 {
552 #if __CUDA_ARCH__ >= 300
553  unsigned int r, c = 0;
554  asm ("vadd4.u32.u32.u32.sat %0,%1,%2,%3;" : "=r"(r):"r"(a),"r"(b),"r"(c));
555 #else /* __CUDA_ARCH__ >= 300 */
556  // This code uses the same basic approach used for non-saturating addition.
557  // The seven low-order bits in each byte are summed by regular addition,
558  // with the carry-out from bit 6 (= carry-in for the msb) being recorded
559  // in bit 7, while the msb is handled separately.
560  //
 561  // The fact that this is a saturating addition simplifies the handling of
562  // the msb. When carry-out from the msb occurs, the entire byte must be
563  // written as 0xff, and the computed msb is overwritten in the process.
564  // The corresponding entries in the truth table for the msb sum bit thus
565  // become "don't cares":
566  //
567  // a b cy-in sum cy-out
568  // ------------------------
569  // 0 0 0 0 0
570  // 0 0 1 1 0
571  // 0 1 0 1 0
572  // 0 1 1 X 1
573  // 1 0 0 1 0
574  // 1 0 1 X 1
575  // 1 1 0 X 1
576  // 1 1 1 X 1
577  //
578  // As is easily seen, the simplest implementation of the sum bit is simply
579  // (a | b) & 0x80808080, with masking needed to isolate the msb. Note that
580  // this computation also makes the msb handling redundant with the clamping
581  // to 0xFF, because the msb is already set to 1 whenever saturation kicks
582  // in. This means we only need to apply saturation to the seven low-order
583  // bits in each byte, by overwriting with 0x7F. Saturation is controlled
584  // by carry-out from the msb, which can be represented by various Boolean
585  // expressions. As we need to compute (a | b) & 0x80808080 anyhow, the most
586  // efficient of these is cy-out = ((a & b) | cy-in) & (a | b) & 0x80808080.
587  unsigned int r;
588  asm ("{ \n\t"
589  ".reg .u32 a,b,r,s,t,m; \n\t"
590  "mov.b32 a, %1; \n\t"
591  "mov.b32 b, %2; \n\t"
592  "or.b32 m, a, b; \n\t"
593  "and.b32 r, a, 0x7f7f7f7f;\n\t" // clear msbs
594  "and.b32 t, b, 0x7f7f7f7f;\n\t" // clear msbs
595  "and.b32 m, m, 0x80808080;\n\t" // result msbs
596  "add.u32 r, r, t; \n\t" // add w/o msbs, record msb-carry-ins
597  "and.b32 t, a, b; \n\t" // (a & b)
 598  "or.b32 t, t, r; \n\t" // (a & b) | cy-in
599  "and.b32 t, t, m; \n\t" // ((a&b)|cy-in) & ((a|b)&0x80808080)
 600  "shr.u32 s, t, 7; \n\t" // msb cy-out ? 1 : 0
 601  "sub.u32 t, t, s; \n\t" // lsb-overwrite: msb cy-out ? 0x7f : 0
602  "or.b32 t, t, m; \n\t" // merge msb and lsb overwrite
603  "or.b32 r, r, t; \n\t" // overwrite bits with 1 as needed
604  "mov.b32 %0, r; \n\t"
605  "}"
606  : "=r"(r) : "r"(a), "r"(b));
607 #endif /* __CUDA_ARCH__ >= 300 */
608  return r;
609 }
610 
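/*
  Annotation (illustrative only; vaddus4_reference is not part of this header): a plain
  per-byte reference for the saturating addition implemented above,
  e.g. vaddus4(0xff400102, 0x01d00203) == 0xffff0305.
*/
static __device__ __forceinline__ unsigned int vaddus4_reference(unsigned int a, unsigned int b)
{
    unsigned int r = 0;
    for (int i = 0; i < 32; i += 8) {
        unsigned int s = ((a >> i) & 0xffU) + ((b >> i) & 0xffU); // exact 9-bit byte sum
        r |= ::min (s, 0xffU) << i;                               // clamp to 0xff (sat.u8)
    }
    return r;
}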
611 static __device__ __forceinline__ unsigned int vaddss4 (unsigned int a, unsigned int b)
612 {
613 #if __CUDA_ARCH__ >= 300
614  unsigned int r, c = 0;
 615  asm ("vadd4.s32.s32.s32.sat %0,%1,%2,%3;" : "=r"(r):"r"(a),"r"(b),"r"(c));
616 #else /* __CUDA_ARCH__ >= 300 */
617  /*
618  For signed saturation, saturation is controlled by the overflow signal:
619  ovfl = (carry-in to msb) XOR (carry-out from msb). Overflow can only
620  occur when the msbs of both inputs are the same. The defined response to
621  overflow is to deliver 0x7f when the addends are positive (bit 7 clear),
622  and 0x80 when the addends are negative (bit 7 set). The truth table for
623  the msb is
624 
625  a b cy_in sum cy_out ovfl
626  --------------------------------
627  0 0 0 0 0 0
628  0 0 1 1 0 1
629  0 1 0 1 0 0
630  0 1 1 0 1 0
631  1 0 0 1 0 0
632  1 0 1 0 1 0
633  1 1 0 0 1 1
634  1 1 1 1 1 0
635 
636  The seven low-order bits can be handled by simple wrapping addition with
637  the carry out from bit 6 recorded in the msb (thus corresponding to the
638  cy_in in the truth table for the msb above). ovfl can be computed in many
639  equivalent ways, here we use ovfl = (a ^ carry_in) & ~(a ^ b) since we
640  already need to compute (a ^ b) for the msb sum bit computation. First we
641  compute the normal, wrapped addition result. When overflow is detected,
642  we mask off the msb of the result, then compute a mask covering the seven
643  low order bits, which are all set to 1. This sets the byte to 0x7f as we
644  previously cleared the msb. In the overflow case, the sign of the result
645  matches the sign of either of the inputs, so we extract the sign of a and
646  add it to the low order bits, which turns 0x7f into 0x80, the correct
647  result for an overflowed negative result.
648  */
649  unsigned int r;
650  asm ("{ \n\t"
651  ".reg .u32 a,b,r,s,t,u; \n\t"
652  "mov.b32 a, %1; \n\t"
653  "mov.b32 b, %2; \n\t"
654  "xor.b32 s, a, b; \n\t" // sum bits
655  "and.b32 r, a, 0x7f7f7f7f;\n\t" // clear msbs
656  "and.b32 t, b, 0x7f7f7f7f;\n\t" // clear msbs
657  "and.b32 s, s, 0x80808080;\n\t" // msb sum bits
658  "add.u32 r, r, t; \n\t" // capture msb carry-in in bit 7
659  "xor.b32 t, a, r; \n\t" // a ^ carry_in
660  "xor.b32 r, r, s; \n\t" // msb sum bit = (a ^ b ^ carry_in)
661  "not.b32 s, s; \n\t" // ~(a ^ b)
662  "and.b32 t, t, s; \n\t" // ovfl = (a ^ carry_in) & ~(a ^ b)
663  "and.b32 t, t, 0x80808080;\n\t" // ovfl ? 0x80 : 0
664  "shr.u32 s, t, 7; \n\t" // ovfl ? 1 : 0
665  "not.b32 u, t; \n\t" // ovfl ? 0x7f : 0xff
666  "and.b32 r, r, u; \n\t" // ovfl ? (a + b) & 0x7f : a + b
667  "and.b32 u, a, t; \n\t" // ovfl ? a & 0x80 : 0
668  "sub.u32 t, t, s; \n\t" // ovfl ? 0x7f : 0
669  "shr.u32 u, u, 7; \n\t" // ovfl ? sign(a) : 0
670  "or.b32 r, r, t; \n\t" // ovfl ? 0x7f : a + b
671  "add.u32 r, r, u; \n\t" // ovfl ? 0x7f+sign(a) : a + b
672  "mov.b32 %0, r; \n\t"
673  "}"
674  : "=r"(r) : "r"(a), "r"(b));
675 #endif /* __CUDA_ARCH__ >= 300 */
676  return r;
677 }
678 
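/*
  Annotation (illustrative only; vaddss4_reference is not part of this header): a plain
  per-byte reference for the signed saturating addition implemented above,
  e.g. vaddss4(0x7f800102, 0x01ff0203) == 0x7f800305.
*/
static __device__ __forceinline__ unsigned int vaddss4_reference(unsigned int a, unsigned int b)
{
    unsigned int r = 0;
    for (int i = 0; i < 32; i += 8) {
        int s = (int)(signed char)(a >> i) + (int)(signed char)(b >> i); // sign-extend both bytes, add exactly
        s = ::max (-128, ::min (s, 127));                                // clamp to [-128, 127] (sat.s8)
        r |= ((unsigned int)s & 0xffU) << i;
    }
    return r;
}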
679 static __device__ __forceinline__ unsigned int vavgs4(unsigned int a, unsigned int b)
680 {
681  unsigned int r;
682 #if __CUDA_ARCH__ >= 300
683  unsigned int c = 0;
684  asm ("vavrg4.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
685 #else /* __CUDA_ARCH__ >= 300 */
686  // avgs (a + b) = ((a + b) < 0) ? ((a + b) >> 1) : ((a + b + 1) >> 1). The
687  // two expressions can be re-written as follows to avoid needing additional
688  // intermediate bits: ((a + b) >> 1) = (a >> 1) + (b >> 1) + ((a & b) & 1),
689  // ((a + b + 1) >> 1) = (a >> 1) + (b >> 1) + ((a | b) & 1). The difference
690  // between the two is ((a ^ b) & 1). Note that if (a + b) < 0, then also
691  // ((a + b) >> 1) < 0, since right shift rounds to negative infinity. This
692  // means we can compute ((a + b) >> 1) then conditionally add ((a ^ b) & 1)
693  // depending on the sign bit of the shifted sum. By handling the msb sum
694  // bit of the result separately, we avoid carry-out during summation and
695  // also can use (potentially faster) logical right shifts.
696  asm ("{ \n\t"
697  ".reg .u32 a,b,c,r,s,t,u,v;\n\t"
698  "mov.b32 a,%1; \n\t"
699  "mov.b32 b,%2; \n\t"
700  "and.b32 u,a,0xfefefefe;\n\t" // prevent shift crossing chunk boundary
701  "and.b32 v,b,0xfefefefe;\n\t" // prevent shift crossing chunk boundary
702  "xor.b32 s,a,b; \n\t" // a ^ b
703  "and.b32 t,a,b; \n\t" // a & b
704  "shr.u32 u,u,1; \n\t" // a >> 1
705  "shr.u32 v,v,1; \n\t" // b >> 1
706  "and.b32 c,s,0x01010101;\n\t" // (a ^ b) & 1
707  "and.b32 s,s,0x80808080;\n\t" // extract msb (a ^ b)
708  "and.b32 t,t,0x01010101;\n\t" // (a & b) & 1
709  "add.u32 r,u,v; \n\t" // (a>>1)+(b>>1)
710  "add.u32 r,r,t; \n\t" // (a>>1)+(b>>1)+(a&b&1); rec. msb cy-in
 711  "xor.b32 r,r,s; \n\t" // compute msb sum bit: a ^ b ^ cy-in
712  "shr.u32 t,r,7; \n\t" // sign ((a + b) >> 1)
713  "not.b32 t,t; \n\t" // ~sign ((a + b) >> 1)
714  "and.b32 t,t,c; \n\t" // ((a ^ b) & 1) & ~sign ((a + b) >> 1)
715  "add.u32 r,r,t; \n\t" // conditionally add ((a ^ b) & 1)
716  "mov.b32 %0,r; \n\t"
717  "}"
718  : "=r"(r) : "r"(a), "r"(b));
719 #endif /* __CUDA_ARCH__ >= 300 */
 720  return r; // byte-wise signed average
721 }
722 
723 static __device__ __forceinline__ unsigned int vavgu4(unsigned int a, unsigned int b)
724 {
725  unsigned int r, c;
726 #if __CUDA_ARCH__ >= 300
727  c = 0;
728  asm ("vavrg4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
729 #else /* __CUDA_ARCH__ >= 300 */
730  // HAKMEM #23: a + b = 2 * (a | b) - (a ^ b) ==>
731  // (a + b + 1) / 2 = (a | b) - ((a ^ b) >> 1)
732  c = a ^ b;
733  r = a | b;
734  c = c & 0xfefefefe; // ensure following shift doesn't cross byte boundaries
735  c = c >> 1;
736  r = r - c;
737 #endif /* __CUDA_ARCH__ >= 300 */
738  return r; // byte-wise unsigned average
739 }
740 
741 static __device__ __forceinline__ unsigned int vhaddu4(unsigned int a, unsigned int b)
742 {
743  // HAKMEM #23: a + b = 2 * (a & b) + (a ^ b) ==>
744  // (a + b) / 2 = (a & b) + ((a ^ b) >> 1)
745  unsigned int r, s;
746  s = a ^ b;
747  r = a & b;
748  s = s & 0xfefefefe; // ensure following shift doesn't cross byte boundaries
749  s = s >> 1;
750  s = r + s;
751  return s; // byte-wise unsigned average [rounded down]
752 }
753 
754 static __device__ __forceinline__ unsigned int vcmpeq4(unsigned int a, unsigned int b)
755 {
756  unsigned int c, r;
757 #if __CUDA_ARCH__ >= 300
758  r = 0;
759  asm ("vset4.u32.u32.eq %0,%1,%2,%3;" : "=r"(c) : "r"(a), "r"(b), "r"(r));
760  r = c << 8; // convert bool
761  r = r - c; // to mask
762 #else /* __CUDA_ARCH__ >= 300 */
763  // inspired by Alan Mycroft's null-byte detection algorithm:
764  // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
765  r = a ^ b; // 0x00 if a == b
766  c = r | 0x80808080; // set msbs, to catch carry out
767  r = r ^ c; // extract msbs, msb = 1 if r < 0x80
768  c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
769  c = r & ~c; // msb = 1, if r was 0x00
770  r = c >> 7; // build mask
771  r = c - r; // from
772  r = r | c; // msbs
773 #endif /* __CUDA_ARCH__ >= 300 */
774  return r; // byte-wise unsigned eq comparison with mask result
775 }
776 
777 static __device__ __forceinline__ unsigned int vcmpgeu4(unsigned int a, unsigned int b)
778 {
779  unsigned int r, c;
780 #if __CUDA_ARCH__ >= 300
781  c = 0;
782  asm ("vset4.u32.u32.ge %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
783  c = r << 8; // convert bool
784  r = c - r; // to mask
785 #else /* __CUDA_ARCH__ >= 300 */
786  asm ("not.b32 %0,%0;" : "+r"(b));
787  c = vavgu4 (a, b); // (a + ~b + 1) / 2 = (a - b) / 2
788  asm ("and.b32 %0,%0,0x80808080;" : "+r"(c)); // msb = carry-outs
789  asm ("shr.u32 %0,%1,7;" : "=r"(r) : "r"(c)); // build mask
790  asm ("sub.u32 %0,%1,%0;" : "+r"(r) : "r"(c)); // from
791  asm ("or.b32 %0,%1,%0;" : "+r"(r) : "r"(c)); // msbs
792 #endif /* __CUDA_ARCH__ >= 300 */
793  return r; // byte-wise unsigned gt-eq comparison with mask result
794 }
795 
796 static __device__ __forceinline__ unsigned int vcmpgtu4(unsigned int a, unsigned int b)
797 {
798  unsigned int r, c;
799 #if __CUDA_ARCH__ >= 300
800  c = 0;
801  asm ("vset4.u32.u32.gt %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
802  c = r << 8; // convert bool
803  r = c - r; // to mask
804 #else /* __CUDA_ARCH__ >= 300 */
805  asm ("not.b32 %0,%0;" : "+r"(b));
806  c = vhaddu4 (a, b); // (a + ~b) / 2 = (a - b) / 2 [rounded down]
807  asm ("and.b32 %0,%0,0x80808080;" : "+r"(c)); // msb = carry-outs
808  asm ("shr.u32 %0,%1,7;" : "=r"(r) : "r"(c)); // build mask
809  asm ("sub.u32 %0,%1,%0;" : "+r"(r) : "r"(c)); // from
810  asm ("or.b32 %0,%1,%0;" : "+r"(r) : "r"(c)); // msbs
811 #endif /* __CUDA_ARCH__ >= 300 */
812  return r; // byte-wise unsigned gt comparison with mask result
813 }
814 
815 static __device__ __forceinline__ unsigned int vcmpleu4(unsigned int a, unsigned int b)
816 {
817  unsigned int r, c;
818 #if __CUDA_ARCH__ >= 300
819  c = 0;
820  asm ("vset4.u32.u32.le %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
821  c = r << 8; // convert bool
822  r = c - r; // to mask
823 #else /* __CUDA_ARCH__ >= 300 */
824  asm ("not.b32 %0,%0;" : "+r"(a));
825  c = vavgu4 (a, b); // (b + ~a + 1) / 2 = (b - a) / 2
826  asm ("and.b32 %0,%0,0x80808080;" : "+r"(c)); // msb = carry-outs
827  asm ("shr.u32 %0,%1,7;" : "=r"(r) : "r"(c)); // build mask
828  asm ("sub.u32 %0,%1,%0;" : "+r"(r) : "r"(c)); // from
829  asm ("or.b32 %0,%1,%0;" : "+r"(r) : "r"(c)); // msbs
830 #endif /* __CUDA_ARCH__ >= 300 */
 831  return r; // byte-wise unsigned lt-eq comparison with mask result
832 }
833 
834 static __device__ __forceinline__ unsigned int vcmpltu4(unsigned int a, unsigned int b)
835 {
836  unsigned int r, c;
837 #if __CUDA_ARCH__ >= 300
838  c = 0;
839  asm ("vset4.u32.u32.lt %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
840  c = r << 8; // convert bool
841  r = c - r; // to mask
842 #else /* __CUDA_ARCH__ >= 300 */
843  asm ("not.b32 %0,%0;" : "+r"(a));
844  c = vhaddu4 (a, b); // (b + ~a) / 2 = (b - a) / 2 [rounded down]
845  asm ("and.b32 %0,%0,0x80808080;" : "+r"(c)); // msb = carry-outs
846  asm ("shr.u32 %0,%1,7;" : "=r"(r) : "r"(c)); // build mask
847  asm ("sub.u32 %0,%1,%0;" : "+r"(r) : "r"(c)); // from
848  asm ("or.b32 %0,%1,%0;" : "+r"(r) : "r"(c)); // msbs
849 #endif /* __CUDA_ARCH__ >= 300 */
850  return r; // byte-wise unsigned lt comparison with mask result
851 }
852 
853 static __device__ __forceinline__ unsigned int vcmpne4(unsigned int a, unsigned int b)
854 {
855  unsigned int r, c;
856 #if __CUDA_ARCH__ >= 300
857  c = 0;
858  asm ("vset4.u32.u32.ne %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
859  c = r << 8; // convert bool
860  r = c - r; // to mask
861 #else /* __CUDA_ARCH__ >= 300 */
862  // inspired by Alan Mycroft's null-byte detection algorithm:
863  // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
864  r = a ^ b; // 0x00 if a == b
865  c = r | 0x80808080; // set msbs, to catch carry out
866  c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
867  c = r | c; // msb = 1, if r was not 0x00
868  asm ("and.b32 %0,%0,0x80808080;" : "+r"(c)); // extract msbs
869  asm ("shr.u32 %0,%1,7;" : "=r"(r) : "r"(c)); // build mask
870  asm ("sub.u32 %0,%1,%0;" : "+r"(r) : "r"(c)); // from
871  asm ("or.b32 %0,%1,%0;" : "+r"(r) : "r"(c)); // msbs
872 #endif /* __CUDA_ARCH__ >= 300 */
873  return r; // byte-wise unsigned ne comparison with mask result
874 }
875 
876 static __device__ __forceinline__ unsigned int vabsdiffu4(unsigned int a, unsigned int b)
877 {
878  unsigned int r, s;
879 #if __CUDA_ARCH__ >= 300
880  s = 0;
881  asm ("vabsdiff4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(a),"r"(b),"r"(s));
882 #else /* __CUDA_ARCH__ >= 300 */
883  s = vcmpgeu4 (a, b);// mask = 0xff if a >= b
884  r = a ^ b; //
885  s = (r & s) ^ b; // select a when a >= b, else select b => max(a,b)
886  r = s ^ r; // select a when b >= a, else select b => min(a,b)
887  r = s - r; // |a - b| = max(a,b) - min(a,b);
888 #endif /* __CUDA_ARCH__ >= 300 */
 889  return r; // byte-wise unsigned absolute difference
890 }
891 
892 static __device__ __forceinline__ unsigned int vmaxu4(unsigned int a, unsigned int b)
893 {
894  unsigned int r, s;
895 #if __CUDA_ARCH__ >= 300
896  s = 0;
897  asm ("vmax4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(s));
898 #else /* __CUDA_ARCH__ >= 300 */
899  s = vcmpgeu4 (a, b);// mask = 0xff if a >= b
 900  r = a & s; // select a when a >= b
 901  s = b & ~s; // select b when a < b
902  r = r | s; // combine byte selections
903 #endif /* __CUDA_ARCH__ >= 300 */
904  return r; // byte-wise unsigned maximum
905 }
906 
907 static __device__ __forceinline__ unsigned int vminu4(unsigned int a, unsigned int b)
908 {
909  unsigned int r, s;
910 #if __CUDA_ARCH__ >= 300
911  s = 0;
912  asm ("vmin4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(s));
913 #else /* __CUDA_ARCH__ >= 300 */
 914  s = vcmpgeu4 (b, a);// mask = 0xff if b >= a
915  r = a & s; // select a when b >= a
916  s = b & ~s; // select b when b < a
917  r = r | s; // combine byte selections
918 #endif /* __CUDA_ARCH__ >= 300 */
919  return r; // byte-wise unsigned minimum
920 }
921 
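/*
  Annotation (sketch, not part of the original header): the byte-wise absolute-difference,
  maximum and minimum emulations above all rely on branchless per-byte selection with the
  all-ones / all-zeros masks produced by vcmpgeu4():
      select(m, a, b) == (a & m) | (b & ~m)      (vmaxu4 / vminu4)
      select(m, a, b) == ((a ^ b) & m) ^ b       (vabsdiffu4; one op cheaper once a ^ b exists)
  together with  min(a, b) == max(a, b) ^ (a ^ b),  so that
      |a - b| == max(a, b) - min(a, b)
  is formed without unpacking the bytes, e.g. vabsdiffu4(0x0510ff00, 0x100500ff) == 0x0b0bffff.
*/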
922 static __device__ __forceinline__ unsigned int vseteq4(unsigned int a, unsigned int b)
923 {
924  unsigned int r, c;
925 #if __CUDA_ARCH__ >= 300
926  c = 0;
927  asm ("vset4.u32.u32.eq %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
928 #else /* __CUDA_ARCH__ >= 300 */
929  // inspired by Alan Mycroft's null-byte detection algorithm:
930  // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
931  r = a ^ b; // 0x00 if a == b
932  c = r | 0x80808080; // set msbs, to catch carry out
933  r = r ^ c; // extract msbs, msb = 1 if r < 0x80
934  c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
935  c = r & ~c; // msb = 1, if r was 0x00
936  r = c >> 7; // convert to bool
937 #endif /* __CUDA_ARCH__ >= 300 */
938  return r; // byte-wise unsigned eq comparison with bool result
939 }
940 
941 static __device__ __forceinline__ unsigned int vsetleu4(unsigned int a, unsigned int b)
942 {
943  unsigned int r, c;
944 #if __CUDA_ARCH__ >= 300
945  c = 0;
946  asm ("vset4.u32.u32.le %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
947 #else /* __CUDA_ARCH__ >= 300 */
948  asm ("not.b32 %0,%0;" : "+r"(a));
949  c = vavgu4 (a, b); // (b + ~a + 1) / 2 = (b - a) / 2
950  c = c & 0x80808080; // msb = carry-outs
951  r = c >> 7; // convert to bool
952 #endif /* __CUDA_ARCH__ >= 300 */
953  return r; // byte-wise unsigned lt-eq comparison with bool result
954 }
955 
956 static __device__ __forceinline__ unsigned int vsetltu4(unsigned int a, unsigned int b)
957 {
958  unsigned int r, c;
959 #if __CUDA_ARCH__ >= 300
960  c = 0;
961  asm ("vset4.u32.u32.lt %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
962 #else /* __CUDA_ARCH__ >= 300 */
963  asm ("not.b32 %0,%0;" : "+r"(a));
964  c = vhaddu4 (a, b); // (b + ~a) / 2 = (b - a) / 2 [rounded down]
965  c = c & 0x80808080; // msb = carry-outs
966  r = c >> 7; // convert to bool
967 #endif /* __CUDA_ARCH__ >= 300 */
968  return r; // byte-wise unsigned lt comparison with bool result
969 }
970 
971 static __device__ __forceinline__ unsigned int vsetgeu4(unsigned int a, unsigned int b)
972 {
973  unsigned int r, c;
974 #if __CUDA_ARCH__ >= 300
975  c = 0;
976  asm ("vset4.u32.u32.ge %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
977 #else /* __CUDA_ARCH__ >= 300 */
978  asm ("not.b32 %0,%0;" : "+r"(b));
979  c = vavgu4 (a, b); // (a + ~b + 1) / 2 = (a - b) / 2
980  c = c & 0x80808080; // msb = carry-outs
981  r = c >> 7; // convert to bool
982 #endif /* __CUDA_ARCH__ >= 300 */
983  return r; // byte-wise unsigned gt-eq comparison with bool result
984 }
985 
986 static __device__ __forceinline__ unsigned int vsetgtu4(unsigned int a, unsigned int b)
987 {
988  unsigned int r, c;
989 #if __CUDA_ARCH__ >= 300
990  c = 0;
991  asm ("vset4.u32.u32.gt %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
992 #else /* __CUDA_ARCH__ >= 300 */
993  asm ("not.b32 %0,%0;" : "+r"(b));
994  c = vhaddu4 (a, b); // (a + ~b) / 2 = (a - b) / 2 [rounded down]
995  c = c & 0x80808080; // msb = carry-outs
996  r = c >> 7; // convert to bool
997 #endif /* __CUDA_ARCH__ >= 300 */
998  return r; // byte-wise unsigned gt comparison with bool result
999 }
1000 
1001 static __device__ __forceinline__ unsigned int vsetne4(unsigned int a, unsigned int b)
1002 {
1003  unsigned int r, c;
1004 #if __CUDA_ARCH__ >= 300
1005  c = 0;
1006  asm ("vset4.u32.u32.ne %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
1007 #else /* __CUDA_ARCH__ >= 300 */
1008  // inspired by Alan Mycroft's null-byte detection algorithm:
1009  // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
1010  r = a ^ b; // 0x00 if a == b
1011  c = r | 0x80808080; // set msbs, to catch carry out
1012  c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
1013  c = r | c; // msb = 1, if r was not 0x00
1014  c = c & 0x80808080; // extract msbs
1015  r = c >> 7; // convert to bool
1016 #endif /* __CUDA_ARCH__ >= 300 */
1017  return r; // byte-wise unsigned ne comparison with bool result
1018 }
1019 
1020 static __device__ __forceinline__ unsigned int vsub4(unsigned int a, unsigned int b)
1021 {
1022 #if __CUDA_ARCH__ >= 300
1023  unsigned int r, c = 0;
1024  asm ("vsub4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
1025 #else /* __CUDA_ARCH__ >= 300 */
1026  unsigned int r, s, t;
 1027  s = a ^ ~b; // inverted difference bits, i.e. ~(a ^ b)
1028  r = a | 0x80808080; // set msbs
1029  t = b & 0x7f7f7f7f; // clear msbs
 1030  s = s & 0x80808080; // inverted msb difference bits
1031  r = r - t; // subtract w/o msbs, record inverted borrows in msb
1032  r = r ^ s; // combine inverted msb sum bits and borrows
1033 #endif /* __CUDA_ARCH__ >= 300 */
1034  return r; // byte-wise difference
1035 }
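/*
  Annotation (illustrative only; vsub4_reference is not part of this header): in the emulation
  above, each byte computes (a | 0x80) - (b & 0x7f) = 128 + (a & 0x7f) - (b & 0x7f), a value in
  [1, 255], so no borrow ever crosses a byte boundary; bit 7 of that partial result is the
  complement of the borrow into the msb, and the final xor with ~(a ^ b) & 0x80808080 turns it
  into the correct msb of each byte-wise difference. A plain per-byte reference:
*/
static __device__ __forceinline__ unsigned int vsub4_reference(unsigned int a, unsigned int b)
{
    unsigned int r = 0;
    for (int i = 0; i < 32; i += 8)
        r |= (((a >> i) - (b >> i)) & 0xffU) << i;   // each byte wraps modulo 256
    return r;
}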
1036 
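/*
  Annotation (usage sketch, not part of the original header; the kernel name and parameters
  are hypothetical): a typical use of these intrinsics -- a per-thread sum of absolute
  differences over packed byte data, processing four byte lanes per 32-bit word.
*/
static __global__ void sad_bytes_kernel(const unsigned int *x, const unsigned int *y,
                                        unsigned int *sad, int n_words)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n_words)
    {
        unsigned int d = vabsdiffu4 (x[i], y[i]);  // per-byte |x - y|, four lanes at once
        // horizontal add of the four byte lanes; each lane is at most 0xff, so this cannot overflow
        sad[i] = (d & 0xffU) + ((d >> 8) & 0xffU) + ((d >> 16) & 0xffU) + (d >> 24);
    }
}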
1037 #endif /* SIMD_FUNCTIONS_H__ */