NVBIO
simd_functions.h
1 /*
2  * Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions are met:
6  *
7  * Redistributions of source code must retain the above copyright notice,
8  * this list of conditions and the following disclaimer.
9  *
10  * Redistributions in binary form must reproduce the above copyright notice,
11  * this list of conditions and the following disclaimer in the documentation
12  * and/or other materials provided with the distribution.
13  *
14  * Neither the name of NVIDIA Corporation nor the names of its contributors
15  * may be used to endorse or promote products derived from this software
16  * without specific prior written permission.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
22  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28  * POSSIBILITY OF SUCH DAMAGE.
29  */
30 
31 /* Release 1.1
32  *
33  * (1) Use of incorrect symbol in multiple-inclusion guard has been corrected.
34  * (2) 44 additional functions were added to the initial set of 38 functions.
35  * (3) The emulation paths for many existing functions were optimized for sm_2x
36  */
37 
38 #if !defined (SIMD_FUNCTIONS_H__)
39 #define SIMD_FUNCTIONS_H__
40 
41 /*
42  This header file contains inline functions that implement intra-word SIMD
43  operations, that are hardware accelerated on sm_3x (Kepler) GPUs. Efficient
44  emulation code paths are provided for earlier architectures (sm_1x, sm_2x)
45  to make the code portable across all GPUs supported by CUDA. The following
46  functions are currently implemented:
47 
48  vabs2(a) per-halfword absolute value, with wrap-around: |a|
49  vabsdiffs2(a,b) per-halfword absolute difference of signed integer: |a - b|
50  vabsdiffu2(a,b) per-halfword absolute difference of unsigned integer: |a - b|
51  vabsss2(a) per-halfword abs. value, with signed saturation: sat.s16(|a|)
52  vadd2(a,b) per-halfword (un)signed addition, with wrap-around: a + b
53  vaddss2(a,b) per-halfword addition with signed saturation: sat.s16 (a + b)
54  vaddus2(a,b) per-halfword addition with unsigned saturation: sat.u16 (a+b)
55  vavgs2(a,b) per-halfword signed rounded average: (a+b+((a+b)>=0)) >> 1
56  vavgu2(a,b) per-halfword unsigned rounded average: (a + b + 1) / 2
57  vcmpeq2(a,b) per-halfword (un)signed comparison: a == b ? 0xffff : 0
58  vcmpges2(a,b) per-halfword signed comparison: a >= b ? 0xffff : 0
59  vcmpgeu2(a,b) per-halfword unsigned comparison: a >= b ? 0xffff : 0
60  vcmpgts2(a,b) per-halfword signed comparison: a > b ? 0xffff : 0
61  vcmpgtu2(a,b) per-halfword unsigned comparison: a > b ? 0xffff : 0
62  vcmples2(a,b) per-halfword signed comparison: a <= b ? 0xffff : 0
63  vcmpleu2(a,b) per-halfword unsigned comparison: a <= b ? 0xffff : 0
64  vcmplts2(a,b) per-halfword signed comparison: a < b ? 0xffff : 0
65  vcmpltu2(a,b) per-halfword unsigned comparison: a < b ? 0xffff : 0
66  vcmpne2(a,b) per-halfword (un)signed comparison: a != b ? 0xffff : 0
67  vhaddu2(a,b) per-halfword unsigned average: (a + b) / 2
68  vmaxs2(a,b) per-halfword signed maximum: max(a, b)
69  vmaxu2(a,b) per-halfword unsigned maximum: max(a, b)
70  vmins2(a,b) per-halfword signed minimum: min(a, b)
71  vminu2(a,b) per-halfword unsigned minimum: min(a, b)
72  vneg2(a) per-halfword negation, with wrap-around: -a
73  vnegss2(a) per-halfword negation, with signed saturation: sat.s16(-a)
74  vsads2(a,b) per-halfword sum of abs diff of signed: sum{0,1}(|a-b|)
75  vsadu2(a,b) per-halfword sum of abs diff of unsigned: sum{0,1}(|a-b|)
76  vseteq2(a,b) per-halfword (un)signed comparison: a == b ? 1 : 0
77  vsetges2(a,b) per-halfword signed comparison: a >= b ? 1 : 0
78  vsetgeu2(a,b) per-halfword unsigned comparison: a >= b ? 1 : 0
79  vsetgts2(a,b) per-halfword signed comparison: a > b ? 1 : 0
80  vsetgtu2(a,b) per-halfword unsigned comparison: a > b ? 1 : 0
81  vsetles2(a,b) per-halfword signed comparison: a <= b ? 1 : 0
82  vsetleu2(a,b) per-halfword unsigned comparison: a <= b ? 1 : 0
83  vsetlts2(a,b) per-halfword signed comparison: a < b ? 1 : 0
84  vsetltu2(a,b) per-halfword unsigned comparison: a < b ? 1 : 0
85  vsetne2(a,b) per-halfword (un)signed comparison: a != b ? 1 : 0
86  vsub2(a,b) per-halfword (un)signed subtraction, with wrap-around: a - b
87  vsubss2(a,b) per-halfword subtraction with signed saturation: sat.s16(a-b)
88  vsubus2(a,b) per-halfword subtraction w/ unsigned saturation: sat.u16(a-b)
89 
90  vabs4(a) per-byte absolute value, with wrap-around: |a|
91  vabsdiffs4(a,b) per-byte absolute difference of signed integer: |a - b|
92  vabsdiffu4(a,b) per-byte absolute difference of unsigned integer: |a - b|
93  vabsss4(a) per-byte absolute value, with signed saturation: sat.s8(|a|)
94  vadd4(a,b) per-byte (un)signed addition, with wrap-around: a + b
95  vaddss4(a,b) per-byte addition with signed saturation: sat.s8 (a + b)
96  vaddus4(a,b) per-byte addition with unsigned saturation: sat.u8 (a + b)
97  vavgs4(a,b) per-byte signed rounded average: (a + b + ((a+b) >= 0)) >> 1
98  vavgu4(a,b) per-byte unsigned rounded average: (a + b + 1) / 2
99  vcmpeq4(a,b) per-byte (un)signed comparison: a == b ? 0xff : 0
100  vcmpges4(a,b) per-byte signed comparison: a >= b ? 0xff : 0
101  vcmpgeu4(a,b) per-byte unsigned comparison: a >= b ? 0xff : 0
102  vcmpgts4(a,b) per-byte signed comparison: a > b ? 0xff : 0
103  vcmpgtu4(a,b) per-byte unsigned comparison: a > b ? 0xff : 0
104  vcmples4(a,b) per-byte signed comparison: a <= b ? 0xff : 0
105  vcmpleu4(a,b) per-byte unsigned comparison: a <= b ? 0xff : 0
106  vcmplts4(a,b) per-byte signed comparison: a < b ? 0xff : 0
107  vcmpltu4(a,b) per-byte unsigned comparison: a < b ? 0xff : 0
108  vcmpne4(a,b) per-byte (un)signed comparison: a != b ? 0xff : 0
109  vhaddu4(a,b) per-byte unsigned average: (a + b) / 2
110  vmaxs4(a,b) per-byte signed maximum: max(a, b)
111  vmaxu4(a,b) per-byte unsigned maximum: max(a, b)
112  vmins4(a,b) per-byte signed minimum: min(a, b)
113  vminu4(a,b) per-byte unsigned minimum: min(a, b)
114  vneg4(a) per-byte negation, with wrap-around: -a
115  vnegss4(a) per-byte negation, with signed saturation: sat.s8(-a)
116  vsads4(a,b) per-byte sum of abs difference of signed: sum{0,3}(|a-b|)
117  vsadu4(a,b) per-byte sum of abs difference of unsigned: sum{0,3}(|a-b|)
118  vseteq4(a,b) per-byte (un)signed comparison: a == b ? 1 : 0
119  vsetges4(a,b) per-byte signed comparison: a >= b ? 1 : 0
120  vsetgeu4(a,b) per-byte unsigned comparison: a >= b ? 1 : 0
121  vsetgts4(a,b) per-byte signed comparison: a > b ? 1 : 0
122  vsetgtu4(a,b) per-byte unsigned comparison: a > b ? 1 : 0
123  vsetles4(a,b) per-byte signed comparison: a <= b ? 1 : 0
124  vsetleu4(a,b) per-byte unsigned comparison: a <= b ? 1 : 0
125  vsetlts4(a,b) per-byte signed comparison: a < b ? 1 : 0
126  vsetltu4(a,b) per-byte unsigned comparison: a < b ? 1 : 0
127  vsetne4(a,b) per-byte (un)signed comparison: a != b ? 1 : 0
128  vsub4(a,b) per-byte (un)signed subtraction, with wrap-around: a - b
129  vsubss4(a,b) per-byte subtraction with signed saturation: sat.s8 (a - b)
130  vsubus4(a,b) per-byte subtraction with unsigned saturation: sat.u8 (a - b)
131 */
132 
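/*
 Worked examples (illustrative, derived from the definitions above): each
 32-bit word holds two independent 16-bit lanes (or four 8-bit lanes for the
 *4 variants), and every lane is computed separately.

 vadd2  (0x00010002, 0x00030005) == 0x00040007  // 1+3 = 4, 2+5 = 7
 vadd2  (0xffff0001, 0x00010001) == 0x00000002  // each halfword wraps independently
 vaddus2(0xfffe0001, 0x00030001) == 0xffff0002  // high lane saturates at 0xffff
 vmaxu4 (0x01020304, 0x04030201) == 0x04030304  // per-byte maximum
*/
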
133 static __device__ __forceinline__ unsigned int vabs2(unsigned int a)
134 {
135  unsigned int r;
136 #if __CUDA_ARCH__ >= 300
137  unsigned int b = 0, c = 0;
138  asm ("vabsdiff2.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) :"r"(a),"r"(b),"r"(c));
139 #elif __CUDA_ARCH__ >= 200
140  asm ("{ \n\t"
141  ".reg .u32 a,m,r; \n\t"
142  "mov.b32 a,%1; \n\t"
143  "prmt.b32 m,a,0,0xbb99; \n\t" // msb ? 0xffff : 0000
144  "xor.b32 r,a,m; \n\t" // conditionally invert bits
145  "and.b32 m,m,0x00010001;\n\t" // msb ? 0x1 : 0
146  "add.u32 r,r,m; \n\t" // conditionally add 1
147  "mov.b32 %0,r; \n\t"
148  "}"
149  : "=r"(r) : "r"(a));
150 #else /* __CUDA_ARCH__ >= 200 */
151  asm ("{ \n\t"
152  ".reg .u32 a,m,r,s; \n\t"
153  "mov.b32 a,%1; \n\t"
154  "and.b32 m,a,0x80008000;\n\t" // extract msb
155  "and.b32 r,a,0x7fff7fff;\n\t" // clear msb
156  "shr.u32 s,m,15; \n\t" // build lsb mask
157  "sub.u32 m,m,s; \n\t" // from msb
158  "xor.b32 r,r,m; \n\t" // conditionally invert lsbs
159  "add.u32 r,r,s; \n\t" // conditionally add 1
160  "mov.b32 %0,r; \n\t"
161  "}"
162  : "=r"(r) : "r"(a));
163 #endif /* __CUDA_ARCH__ >= 200 */
164  return r; // halfword-wise absolute value, with wrap-around
165 }
166 
167 static __device__ __forceinline__ unsigned int vabsss2(unsigned int a)
168 {
169  unsigned int r;
170 #if __CUDA_ARCH__ >= 300
171  unsigned int b = 0, c = 0;
172  asm("vabsdiff2.s32.s32.s32.sat %0,%1,%2,%3;":"=r"(r):"r"(a),"r"(b),"r"(c));
173 #elif __CUDA_ARCH__ >= 200
174  asm ("{ \n\t"
175  ".reg .u32 a,m,r; \n\t"
176  "mov.b32 a,%1; \n\t"
177  "prmt.b32 m,a,0,0xbb99; \n\t" // msb ? 0xffff : 0000
178  "xor.b32 r,a,m; \n\t" // conditionally invert bits
179  "and.b32 m,m,0x00010001;\n\t" // msb ? 0x1 : 0
180  "add.u32 r,r,m; \n\t" // conditionally add 1
181  "prmt.b32 m,r,0,0xbb99; \n\t" // msb ? 0xffff : 0000
182  "and.b32 m,m,0x00010001;\n\t" // msb ? 0x1 : 0
183  "sub.u32 r,r,m; \n\t" // subtract 1 if result wrapped around
184  "mov.b32 %0,r; \n\t"
185  "}"
186  : "=r"(r) : "r"(a));
187 #else /* __CUDA_ARCH__ >= 200 */
188  asm ("{ \n\t"
189  ".reg .u32 a,m,r,s; \n\t"
190  "mov.b32 a,%1; \n\t"
191  "and.b32 m,a,0x80008000;\n\t" // extract msb
192  "and.b32 r,a,0x7fff7fff;\n\t" // clear msb
193  "shr.u32 s,m,15; \n\t" // build lsb mask
194  "sub.u32 m,m,s; \n\t" // from msb
195  "xor.b32 r,r,m; \n\t" // conditionally invert lsbs
196  "add.u32 r,r,s; \n\t" // conditionally add 1
197  "and.b32 m,r,0x80008000;\n\t" // extract msb (1 if wrap-around)
198  "shr.u32 s,m,15; \n\t" // msb ? 1 : 0
199  "sub.u32 r,r,s; \n\t" // subtract 1 if result wrapped around
200  "mov.b32 %0,r; \n\t"
201  "}"
202  : "=r"(r) : "r"(a));
203 #endif /* __CUDA_ARCH__ >= 200 */
204  return r; // halfword-wise absolute value with signed saturation
205 }
206 
207 static __device__ __forceinline__ unsigned int vadd2(unsigned int a, unsigned int b)
208 {
209  unsigned int s, t;
210 #if __CUDA_ARCH__ >= 300
211  s = 0;
212  asm ("vadd2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(t) : "r"(a), "r"(b), "r"(s));
213 #else /* __CUDA_ARCH__ >= 300 */
214  s = a ^ b; // sum bits
215  t = a + b; // actual sum
216  s = s ^ t; // determine carry-ins for each bit position
217  s = s & 0x00010000; // carry-in to high word (= carry-out from low word)
218  t = t - s; // subtract out carry-out from low word
219 #endif /* __CUDA_ARCH__ >= 300 */
220  return t; // halfword-wise sum, with wrap around
221 }
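
// Usage sketch (illustrative; the kernel and parameter names are placeholders,
// not part of the interface above): adding two buffers of packed 16-bit
// unsigned values, two lanes per 32-bit word.
static __global__ void vadd2_example_kernel(const unsigned int *a,
                                            const unsigned int *b,
                                            unsigned int *out,
                                            int n_words)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n_words)
        out[i] = vadd2(a[i], b[i]); // per-halfword addition with wrap-around
}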
222 
223 static __device__ __forceinline__ unsigned int vaddss2 (unsigned int a, unsigned int b)
224 {
225  unsigned int r;
226 #if __CUDA_ARCH__ >= 300
227  unsigned int c = 0;
228  asm ("vadd2.s32.s32.s32.sat %0,%1,%2,%3;" : "=r"(r):"r"(a),"r"(b),"r"(c));
229 #else /* __CUDA_ARCH__ >= 300 */
230  int ahi, alo, blo, bhi, rhi, rlo;
231  ahi = (int)((a & 0xffff0000U));
232  bhi = (int)((b & 0xffff0000U));
233 #if __CUDA_ARCH__ < 200
234  alo = (int)(a << 16);
235  blo = (int)(b << 16);
236 #elif __CUDA_ARCH__ < 350
237  // work around (we would want left shifts at least for sm_2x)
238  asm ("prmt.b32 %0,%1,0,0x1044;" : "=r"(alo) : "r"(a));
239  asm ("prmt.b32 %0,%1,0,0x1044;" : "=r"(blo) : "r"(b));
240 #else
241  asm ("shf.l.clamp.b32 %0,0,%1,16;" : "=r"(alo) : "r"(a));
242  asm ("shf.l.clamp.b32 %0,0,%1,16;" : "=r"(blo) : "r"(b));
243 #endif
244  asm ("add.sat.s32 %0,%1,%2;" : "=r"(rlo) : "r"(alo), "r"(blo));
245  asm ("add.sat.s32 %0,%1,%2;" : "=r"(rhi) : "r"(ahi), "r"(bhi));
246 #if __CUDA_ARCH__ < 200
247  r = ((unsigned int)rhi & 0xffff0000U) | ((unsigned int)rlo >> 16);
248 #else
249  asm ("prmt.b32 %0,%1,%2,0x7632;" : "=r"(r) : "r"(rlo), "r"(rhi));
250 #endif /* __CUDA_ARCH__ < 200 */
251 #endif /* __CUDA_ARCH__ >= 300 */
252  return r; // halfword-wise sum with signed saturation
253 }
254 
255 static __device__ __forceinline__ unsigned int vaddus2 (unsigned int a, unsigned int b)
256 {
257  unsigned int r;
258 #if __CUDA_ARCH__ >= 300
259  unsigned int c = 0;
260  asm ("vadd2.u32.u32.u32.sat %0,%1,%2,%3;" : "=r"(r):"r"(a),"r"(b),"r"(c));
261 #else /* __CUDA_ARCH__ >= 300 */
262  int alo, blo, rlo, ahi, bhi, rhi;
263  asm ("{ \n\t"
264  "and.b32 %0, %4, 0xffff; \n\t"
265  "and.b32 %1, %5, 0xffff; \n\t"
266 #if __CUDA_ARCH__ < 350
267  "shr.u32 %2, %4, 16; \n\t"
268  "shr.u32 %3, %5, 16; \n\t"
269 #else /* __CUDA_ARCH__ < 350 */
270  "shf.r.clamp.b32 %2, %4, 0, 16;\n\t"
271  "shf.r.clamp.b32 %3, %5, 0, 16;\n\t"
272 #endif /* __CUDA_ARCH__ < 350 */
273  "}"
274  : "=r"(alo), "=r"(blo), "=r"(ahi), "=r"(bhi)
275  : "r"(a), "r"(b));
276  rlo = min (alo + blo, 65535);
277  rhi = min (ahi + bhi, 65535);
278  r = (rhi << 16) + rlo;
279 #endif /* __CUDA_ARCH__ >= 300 */
280  return r; // halfword-wise sum with unsigned saturation
281 }
282 
283 static __device__ __forceinline__ unsigned int vavgs2(unsigned int a, unsigned int b)
284 {
285  unsigned int r;
286 #if __CUDA_ARCH__ >= 300
287  unsigned int c = 0;
288  asm ("vavrg2.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
289 #else /* __CUDA_ARCH__ >= 300 */
290  // avgs(a,b) = ((a + b) < 0) ? ((a + b) >> 1) : ((a + b + 1) >> 1). The
291  // two expressions can be re-written as follows to avoid needing additional
292  // intermediate bits: ((a + b) >> 1) = (a >> 1) + (b >> 1) + ((a & b) & 1),
293  // ((a + b + 1) >> 1) = (a >> 1) + (b >> 1) + ((a | b) & 1). The difference
294  // between the two is ((a ^ b) & 1). Note that if (a + b) < 0, then also
295  // ((a + b) >> 1) < 0, since right shift rounds to negative infinity. This
296  // means we can compute ((a + b) >> 1) then conditionally add ((a ^ b) & 1)
297  // depending on the sign bit of the shifted sum. By handling the msb sum
298  // bit of the result separately, we avoid carry-out during summation and
299  // also can use (potentially faster) logical right shifts.
300  asm ("{ \n\t"
301  ".reg .u32 a,b,c,r,s,t,u,v;\n\t"
302  "mov.b32 a,%1; \n\t"
303  "mov.b32 b,%2; \n\t"
304  "and.b32 u,a,0xfffefffe;\n\t" // prevent shift crossing chunk boundary
305  "and.b32 v,b,0xfffefffe;\n\t" // prevent shift crossing chunk boundary
306  "xor.b32 s,a,b; \n\t" // a ^ b
307  "and.b32 t,a,b; \n\t" // a & b
308  "shr.u32 u,u,1; \n\t" // a >> 1
309  "shr.u32 v,v,1; \n\t" // b >> 1
310  "and.b32 c,s,0x00010001;\n\t" // (a ^ b) & 1
311  "and.b32 s,s,0x80008000;\n\t" // extract msb (a ^ b)
312  "and.b32 t,t,0x00010001;\n\t" // (a & b) & 1
313  "add.u32 r,u,v; \n\t" // (a>>1)+(b>>1)
314  "add.u32 r,r,t; \n\t" // (a>>1)+(b>>1)+(a&b&1); rec. msb cy-in
315  "xor.b32 r,r,s; \n\t" // compute msb sum bit: a ^ b ^ cy-in
316  "shr.u32 t,r,15; \n\t" // sign ((a + b) >> 1)
317  "not.b32 t,t; \n\t" // ~sign ((a + b) >> 1)
318  "and.b32 t,t,c; \n\t" // ((a ^ b) & 1) & ~sign ((a + b) >> 1)
319  "add.u32 r,r,t; \n\t" // conditionally add ((a ^ b) & 1)
320  "mov.b32 %0,r; \n\t"
321  "}"
322  : "=r"(r) : "r"(a), "r"(b));
323 #endif /* __CUDA_ARCH__ >= 300 */
324  return r; // halfword-wise average of signed integers
325 }
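
// Worked example (illustrative): averaging the lanes 3|-3 (0x0003fffd) with
// 0|0 gives 2|-2 (0x0002fffe): the positive half-sum 1.5 rounds up to 2 and
// the negative half-sum -1.5 rounds down to -2, as described above.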
326 
327 static __device__ __forceinline__ unsigned int vavgu2(unsigned int a, unsigned int b)
328 {
329  unsigned int r, c;
330 #if __CUDA_ARCH__ >= 300
331  c = 0;
332  asm ("vavrg2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
333 #else /* __CUDA_ARCH__ >= 300 */
334  // HAKMEM #23: a + b = 2 * (a | b) - (a ^ b) ==>
335  // (a + b + 1) / 2 = (a | b) - ((a ^ b) >> 1)
336  c = a ^ b;
337  r = a | b;
338  c = c & 0xfffefffe; // ensure shift doesn't cross half-word boundaries
339  c = c >> 1;
340  r = r - c;
341 #endif /* __CUDA_ARCH__ >= 300 */
342  return r; // halfword-wise average of unsigned integers
343 }
344 
345 static __device__ __forceinline__ unsigned int vhaddu2(unsigned int a, unsigned int b)
346 {
347  // HAKMEM #23: a + b = 2 * (a & b) + (a ^ b) ==>
348  // (a + b) / 2 = (a & b) + ((a ^ b) >> 1)
349  unsigned int r, s;
350  s = a ^ b;
351  r = a & b;
352  s = s & 0xfffefffe; // ensure shift doesn't cross halfword boundaries
353  s = s >> 1;
354  r = r + s;
355  return r; // halfword-wise average of unsigned ints, rounded down
356 }
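
// Worked example (illustrative): for the packed lanes 0x0005|0x0003 and
// 0x0007|0x0002, vavgu2 returns 0x0006|0x0003 (the low lane's 2.5 rounds up)
// while vhaddu2 returns 0x0006|0x0002 (rounds down), matching the HAKMEM
// identities quoted above.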
357 
358 static __device__ __forceinline__ unsigned int vcmpeq2(unsigned int a, unsigned int b)
359 {
360  unsigned int r, c;
361 #if __CUDA_ARCH__ >= 300
362  c = 0;
363  asm ("vset2.u32.u32.eq %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
364  c = r << 16; // convert bool
365  r = c - r; // into mask
366 #else /* __CUDA_ARCH__ >= 300 */
367  // inspired by Alan Mycroft's null-byte detection algorithm:
368  // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
369  r = a ^ b; // 0x0000 if a == b
370  c = r | 0x80008000; // set msbs, to catch carry out
371  r = r ^ c; // extract msbs, msb = 1 if r < 0x8000
372  c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
373  c = r & ~c; // msb = 1, if r was 0x0000
374 #if __CUDA_ARCH__ >= 200
375  asm ("prmt.b32 %0,%1,0,0xbb99;" : "=r"(r) : "r"(c));// convert msbs to mask
376 #else /* __CUDA_ARCH__ >= 200 */
377  asm ("shr.u32 %0,%1,15;" : "=r"(r) : "r"(c)); // convert
378  asm ("sub.u32 %0,%1,%0;" : "+r"(r) : "r"(c)); // msbs to
379  asm ("or.b32 %0,%1,%0;" : "+r"(r) : "r"(c)); // mask
380 #endif /* __CUDA_ARCH__ >= 200 */
381 #endif /* __CUDA_ARCH__ >= 300 */
382  return r; // halfword-wise (un)signed eq comparison, mask result
383 }
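
// Usage sketch (illustrative; the helper name is a placeholder): the
// 0xffff/0x0000 masks returned by the vcmp* functions enable branchless
// per-halfword selection.
static __device__ __forceinline__ unsigned int vtakeeq2_example(
    unsigned int a, unsigned int b, unsigned int x, unsigned int y)
{
    unsigned int m = vcmpeq2(a, b); // 0xffff where a == b, 0x0000 elsewhere
    return (x & m) | (y & ~m);      // per-halfword select: x if equal, else y
}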
384 
385 static __device__ __forceinline__ unsigned int vcmpges2(unsigned int a, unsigned int b)
386 {
387  unsigned int r;
388 #if __CUDA_ARCH__ >= 300
389  unsigned int c = 0;
390  asm ("vset2.s32.s32.ge %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
391  c = r << 16; // convert bool
392  r = c - r; // to mask
393 #else /* __CUDA_ARCH__ >= 300 */
394  asm ("{ \n\t"
395  ".reg .u32 a, b, r, s, t, u; \n\t"
396  "mov.b32 a,%1; \n\t"
397  "mov.b32 b,%2; \n\t"
398  "and.b32 s,a,0xffff0000;\n\t" // high word of a
399  "and.b32 t,b,0xffff0000;\n\t" // high word of b
400  "set.ge.s32.s32 u,s,t; \n\t" // compare two high words
401  "cvt.s32.s16 s,a; \n\t" // sign-extend low word of a
402  "cvt.s32.s16 t,b; \n\t" // sign-extend low word of b
403  "set.ge.s32.s32 s,s,t; \n\t" // compare two low words
404 #if __CUDA_ARCH__ >= 200
405  "prmt.b32 r,s,u,0x7632; \n\t" // combine low and high results
406 #else /* __CUDA_ARCH__ >= 200 */
407  "and.b32 u,u,0xffff0000;\n\t" // mask comparison result hi word
408  "and.b32 s,s,0x0000ffff;\n\t" // mask comparison result lo word
409  "or.b32 r,s,u; \n\t" // combine the two results
410 #endif /* __CUDA_ARCH__ >= 200 */
411  "mov.b32 %0,r; \n\t"
412  "}"
413  : "=r"(r) : "r"(a), "r"(b));
414 #endif /* __CUDA_ARCH__ >= 300 */
415  return r; // halfword-wise signed gt-eq comparison, mask result
416 }
417 
418 static __device__ __forceinline__ unsigned int vcmpgeu2(unsigned int a, unsigned int b)
419 {
420  unsigned int r, c;
421 #if __CUDA_ARCH__ >= 300
422  c = 0;
423  asm ("vset2.u32.u32.ge %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
424  c = r << 16; // convert bool
425  r = c - r; // into mask
426 #else /* __CUDA_ARCH__ >= 300 */
427  asm ("not.b32 %0,%0;" : "+r"(b));
428  c = vavgu2 (a, b); // (a + ~b + 1) / 2 = (a - b) / 2
429 #if __CUDA_ARCH__ >= 200
430  asm ("prmt.b32 %0,%1,0,0xbb99;" : "=r"(r) : "r"(c));// build mask from msbs
431 #else /* __CUDA_ARCH__ >= 200 */
432  asm ("and.b32 %0,%0,0x80008000;" : "+r"(c)); // msb = carry-outs
433  asm ("shr.u32 %0,%1,15;" : "=r"(r) : "r"(c)); // build mask
434  asm ("sub.u32 %0,%1,%0;" : "+r"(r) : "r"(c)); // from
435  asm ("or.b32 %0,%1,%0;" : "+r"(r) : "r"(c)); // msbs
436 #endif /* __CUDA_ARCH__ >= 200 */
437 #endif /* __CUDA_ARCH__ >= 300 */
438  return r; // halfword-wise unsigned gt-eq comparison, mask result
439 }
440 
441 static __device__ __forceinline__ unsigned int vcmpgts2(unsigned int a, unsigned int b)
442 {
443  unsigned int r;
444 #if __CUDA_ARCH__ >= 300
445  unsigned int c = 0;
446  asm ("vset2.s32.s32.gt %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
447  c = r << 16; // convert bool
448  r = c - r; // to mask
449 #else /* __CUDA_ARCH__ >= 300 */
450  asm ("{ \n\t"
451  ".reg .u32 a, b, r, s, t, u; \n\t"
452  "mov.b32 a,%1; \n\t"
453  "mov.b32 b,%2; \n\t"
454  "and.b32 s,a,0xffff0000;\n\t" // high word of a
455  "and.b32 t,b,0xffff0000;\n\t" // high word of b
456  "set.gt.s32.s32 u,s,t; \n\t" // compare two high words
457  "cvt.s32.s16 s,a; \n\t" // sign-extend low word of a
458  "cvt.s32.s16 t,b; \n\t" // sign-extend low word of b
459  "set.gt.s32.s32 s,s,t; \n\t" // compare two low words
460 #if __CUDA_ARCH__ >= 200
461  "prmt.b32 r,s,u,0x7632; \n\t" // combine low and high results
462 #else /* __CUDA_ARCH__ >= 200 */
463  "and.b32 u,u,0xffff0000;\n\t" // mask comparison result hi word
464  "and.b32 s,s,0x0000ffff;\n\t" // mask comparison result lo word
465  "or.b32 r,s,u; \n\t" // combine the two results
466 #endif /* __CUDA_ARCH__ >= 200 */
467  "mov.b32 %0,r; \n\t"
468  "}"
469  : "=r"(r) : "r"(a), "r"(b));
470 #endif /* __CUDA_ARCH__ >= 300 */
471  return r; // halfword-wise signed gt comparison with mask result
472 }
473 
474 static __device__ __forceinline__ unsigned int vcmpgtu2(unsigned int a, unsigned int b)
475 {
476  unsigned int r, c;
477 #if __CUDA_ARCH__ >= 300
478  c = 0;
479  asm ("vset2.u32.u32.gt %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
480  c = r << 16; // convert bool
481  r = c - r; // into mask
482 #else /* __CUDA_ARCH__ >= 300 */
483  asm ("not.b32 %0,%0;" : "+r"(b));
484  c = vhaddu2 (a, b); // (a + ~b) / 2 = (a - b) / 2 [rounded down]
485 #if __CUDA_ARCH__ >= 200
486  asm ("prmt.b32 %0,%1,0,0xbb99;" : "=r"(r) : "r"(c));// build mask from msbs
487 #else /* __CUDA_ARCH__ >= 200 */
488  asm ("and.b32 %0,%0,0x80008000;" : "+r"(c)); // msb = carry-outs
489  asm ("shr.u32 %0,%1,15;" : "=r"(r) : "r"(c)); // build mask
490  asm ("sub.u32 %0,%1,%0;" : "+r"(r) : "r"(c)); // from
491  asm ("or.b32 %0,%1,%0;" : "+r"(r) : "r"(c)); // msbs
492 #endif /* __CUDA_ARCH__ >= 200 */
493 #endif /* __CUDA_ARCH__ >= 300 */
494  return r; // halfword-wise unsigned gt comparison, mask result
495 }
496 
497 static __device__ __forceinline__ unsigned int vcmples2(unsigned int a, unsigned int b)
498 {
499  unsigned int r;
500 #if __CUDA_ARCH__ >= 300
501  unsigned int c = 0;
502  asm ("vset2.s32.s32.le %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
503  c = r << 16; // convert bool
504  r = c - r; // to mask
505 #else /* __CUDA_ARCH__ >= 300 */
506  asm ("{ \n\t"
507  ".reg .u32 a, b, r, s, t, u; \n\t"
508  "mov.b32 a,%1; \n\t"
509  "mov.b32 b,%2; \n\t"
510  "and.b32 s,a,0xffff0000;\n\t" // high word of a
511  "and.b32 t,b,0xffff0000;\n\t" // high word of b
512  "set.le.s32.s32 u,s,t; \n\t" // compare two high words
513  "cvt.s32.s16 s,a; \n\t" // sign-extend low word of a
514  "cvt.s32.s16 t,b; \n\t" // sign-extend low word of b
515  "set.le.s32.s32 s,s,t; \n\t" // compare two low words
516 #if __CUDA_ARCH__ >= 200
517  "prmt.b32 r,s,u,0x7632; \n\t" // combine low and high results
518 #else /* __CUDA_ARCH__ >= 200 */
519  "and.b32 u,u,0xffff0000;\n\t" // mask comparison result hi word
520  "and.b32 s,s,0x0000ffff;\n\t" // mask comparison result lo word
521  "or.b32 r,s,u; \n\t" // combine the two results
522 #endif /* __CUDA_ARCH__ >= 200 */
523  "mov.b32 %0,r; \n\t"
524  "}"
525  : "=r"(r) : "r"(a), "r"(b));
526 #endif /* __CUDA_ARCH__ >= 300 */
527  return r; // halfword-wise signed lt-eq comparison, mask result
528 }
529 
530 static __device__ __forceinline__ unsigned int vcmpleu2(unsigned int a, unsigned int b)
531 {
532  unsigned int r, c;
533 #if __CUDA_ARCH__ >= 300
534  c = 0;
535  asm ("vset2.u32.u32.le %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
536  c = r << 16; // convert bool
537  r = c - r; // into mask
538 #else /* __CUDA_ARCH__ >= 300 */
539  asm ("not.b32 %0,%0;" : "+r"(a));
540  c = vavgu2 (a, b); // (b + ~a + 1) / 2 = (b - a) / 2
541 #if __CUDA_ARCH__ >= 200
542  asm ("prmt.b32 %0,%1,0,0xbb99;" : "=r"(r) : "r"(c));// build mask from msbs
543 #else /* __CUDA_ARCH__ >= 200 */
544  asm ("and.b32 %0,%0,0x80008000;" : "+r"(c)); // msb = carry-outs
545  asm ("shr.u32 %0,%1,15;" : "=r"(r) : "r"(c)); // build mask
546  asm ("sub.u32 %0,%1,%0;" : "+r"(r) : "r"(c)); // from
547  asm ("or.b32 %0,%1,%0;" : "+r"(r) : "r"(c)); // msbs
548 #endif /* __CUDA_ARCH__ >= 200 */
549 #endif /* __CUDA_ARCH__ >= 300 */
550  return r; // halfword-wise unsigned lt-eq comparison, mask result
551 }
552 
553 static __device__ __forceinline__ unsigned int vcmplts2(unsigned int a, unsigned int b)
554 {
555  unsigned int r;
556 #if __CUDA_ARCH__ >= 300
557  unsigned int c = 0;
558  asm ("vset2.s32.s32.lt %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
559  c = r << 16; // convert bool
560  r = c - r; // to mask
561 #else /* __CUDA_ARCH__ >= 300 */
562  asm ("{ \n\t"
563  ".reg .u32 a, b, r, s, t, u; \n\t"
564  "mov.b32 a,%1; \n\t"
565  "mov.b32 b,%2; \n\t"
566  "and.b32 s,a,0xffff0000;\n\t" // high word of a
567  "and.b32 t,b,0xffff0000;\n\t" // high word of b
568  "set.lt.s32.s32 u,s,t; \n\t" // compare two high words
569  "cvt.s32.s16 s,a; \n\t" // sign-extend low word of a
570  "cvt.s32.s16 t,b; \n\t" // sign-extend low word of b
571  "set.lt.s32.s32 s,s,t; \n\t" // compare two low words
572 #if __CUDA_ARCH__ >= 200
573  "prmt.b32 r,s,u,0x7632; \n\t" // combine low and high results
574 #else /* __CUDA_ARCH__ >= 200 */
575  "and.b32 u,u,0xffff0000;\n\t" // mask comparison result hi word
576  "and.b32 s,s,0x0000ffff;\n\t" // mask comparison result lo word
577  "or.b32 r,s,u; \n\t" // combine the two results
578 #endif /* __CUDA_ARCH__ >= 200 */
579  "mov.b32 %0,r; \n\t"
580  "}"
581  : "=r"(r) : "r"(a), "r"(b));
582 #endif /* __CUDA_ARCH__ >= 300 */
583  return r; // halfword-wise signed lt comparison with mask result
584 }
585 
586 static __device__ __forceinline__ unsigned int vcmpltu2(unsigned int a, unsigned int b)
587 {
588  unsigned int r, c;
589 #if __CUDA_ARCH__ >= 300
590  c = 0;
591  asm ("vset2.u32.u32.lt %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
592  c = r << 16; // convert bool
593  r = c - r; // into mask
594 #else /* __CUDA_ARCH__ >= 300 */
595  asm ("not.b32 %0,%0;" : "+r"(a));
596  c = vhaddu2 (a, b); // (b + ~a) / 2 = (b - a) / 2 [rounded down]
597 #if __CUDA_ARCH__ >= 200
598  asm ("prmt.b32 %0,%1,0,0xbb99;" : "=r"(r) : "r"(c));// build mask from msbs
599 #else /* __CUDA_ARCH__ >= 200 */
600  asm ("and.b32 %0,%0,0x80008000;" : "+r"(c)); // msb = carry-outs
601  asm ("shr.u32 %0,%1,15;" : "=r"(r) : "r"(c)); // build mask
602  asm ("sub.u32 %0,%1,%0;" : "+r"(r) : "r"(c)); // from
603  asm ("or.b32 %0,%1,%0;" : "+r"(r) : "r"(c)); // msbs
604 #endif /* __CUDA_ARCH__ >= 200 */
605 #endif /* __CUDA_ARCH__ >= 300 */
606  return r; // halfword-wise unsigned lt comparison, mask result
607 }
608 
609 static __device__ __forceinline__ unsigned int vcmpne2(unsigned int a, unsigned int b)
610 {
611  unsigned int r, c;
612 #if __CUDA_ARCH__ >= 300
613  c = 0;
614  asm ("vset2.u32.u32.ne %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
615  c = r << 16; // convert bool
616  r = c - r; // into mask
617 #else /* __CUDA_ARCH__ >= 300 */
618  // inspired by Alan Mycroft's null-byte detection algorithm:
619  // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
620  r = a ^ b; // 0x0000 if a == b
621  c = r | 0x80008000; // set msbs, to catch carry out
622  c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
623  c = r | c; // msb = 1, if r was not 0x0000
624 #if __CUDA_ARCH__ >= 200
625  asm ("prmt.b32 %0,%1,0,0xbb99;" : "=r"(r) : "r"(c));// build mask from msbs
626 #else /* __CUDA_ARCH__ >= 200 */
627  asm ("and.b32 %0,%0,0x80008000;" : "+r"(c)); // extract msbs
628  asm ("shr.u32 %0,%1,15;" : "=r"(r) : "r"(c)); // build mask
629  asm ("sub.u32 %0,%1,%0;" : "+r"(r) : "r"(c)); // from
630  asm ("or.b32 %0,%1,%0;" : "+r"(r) : "r"(c)); // msbs
631 #endif /* __CUDA_ARCH__ >= 200 */
632 #endif /* __CUDA_ARCH__ >= 300 */
633  return r; // halfword-wise (un)signed ne comparison, mask result
634 }
635 
636 static __device__ __forceinline__ unsigned int vabsdiffu2(unsigned int a, unsigned int b)
637 {
638  unsigned int r, s;
639 #if __CUDA_ARCH__ >= 300
640  s = 0;
641  asm ("vabsdiff2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) :"r"(a),"r"(b),"r"(s));
642 #else /* __CUDA_ARCH__ >= 300 */
643  unsigned int t, u, v;
644  s = a & 0x0000ffff; // extract low halfword
645  r = b & 0x0000ffff; // extract low halfword
646  u = max (r, s); // maximum of low halfwords
647  v = min (r, s); // minimum of low halfwords
648  s = a & 0xffff0000; // extract high halfword
649  r = b & 0xffff0000; // extract high halfword
650  t = max (r, s); // maximum of high halfwords
651  s = min (r, s); // minimum of high halfwords
652  r = u | t; // maximum of both halfwords
653  s = v | s; // minimum of both halfwords
654  r = r - s; // |a - b| = max(a,b) - min(a,b);
655 #endif /* __CUDA_ARCH__ >= 300 */
656  return r; // halfword-wise absolute difference of unsigned ints
657 }
658 
659 static __device__ __forceinline__ unsigned int vmaxs2(unsigned int a, unsigned int b)
660 {
661  unsigned int r, s;
662 #if __CUDA_ARCH__ >= 300
663  s = 0;
664  asm ("vmax2.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(s));
665 #else /* __CUDA_ARCH__ >= 300 */
666  unsigned int t, u;
667  asm ("cvt.s32.s16 %0,%1;" : "=r"(r) : "r"(a)); // extract low halfword
668  asm ("cvt.s32.s16 %0,%1;" : "=r"(s) : "r"(b)); // extract low halfword
669  t = max((int)r,(int)s); // maximum of low halfwords
670  r = a & 0xffff0000; // extract high halfword
671  s = b & 0xffff0000; // extract high halfword
672  u = max((int)r,(int)s); // maximum of high halfwords
673  r = u | (t & 0xffff); // combine halfword maximums
674 #endif /* __CUDA_ARCH__ >= 300 */
675  return r; // halfword-wise maximum of signed integers
676 }
677 
678 static __device__ __forceinline__ unsigned int vmaxu2(unsigned int a, unsigned int b)
679 {
680  unsigned int r, s;
681 #if __CUDA_ARCH__ >= 300
682  s = 0;
683  asm ("vmax2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(s));
684 #else /* __CUDA_ARCH__ >= 300 */
685  unsigned int t, u;
686  r = a & 0x0000ffff; // extract low halfword
687  s = b & 0x0000ffff; // extract low halfword
688  t = max (r, s); // maximum of low halfwords
689  r = a & 0xffff0000; // extract high halfword
690  s = b & 0xffff0000; // extract high halfword
691  u = max (r, s); // maximum of high halfwords
692  r = t | u; // combine halfword maximums
693 #endif /* __CUDA_ARCH__ >= 300 */
694  return r; // halfword-wise maximum of unsigned integers
695 }
696 
697 static __device__ __forceinline__ unsigned int vmins2(unsigned int a, unsigned int b)
698 {
699  unsigned int r, s;
700 #if __CUDA_ARCH__ >= 300
701  s = 0;
702  asm ("vmin2.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(s));
703 #else /* __CUDA_ARCH__ >= 300 */
704  unsigned int t, u;
705  asm ("cvt.s32.s16 %0,%1;" : "=r"(r) : "r"(a)); // extract low halfword
706  asm ("cvt.s32.s16 %0,%1;" : "=r"(s) : "r"(b)); // extract low halfword
707  t = min((int)r,(int)s); // minimum of low halfwords
708  r = a & 0xffff0000; // extract high halfword
709  s = b & 0xffff0000; // extract high halfword
710  u = min((int)r,(int)s); // minimum of high halfwords
711  r = u | (t & 0xffff); // combine halfword minimums
712 #endif /* __CUDA_ARCH__ >= 300 */
713  return r; // halfword-wise minimum of signed integers
714 }
715 
716 static __device__ __forceinline__ unsigned int vminu2(unsigned int a, unsigned int b)
717 {
718  unsigned int r, s;
719 #if __CUDA_ARCH__ >= 300
720  s = 0;
721  asm ("vmin2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(s));
722 #else /* __CUDA_ARCH__ >= 300 */
723  unsigned int t, u;
724  r = a & 0x0000ffff; // extract low halfword
725  s = b & 0x0000ffff; // extract low halfword
726  t = min (r, s); // minimum of low halfwords
727  r = a & 0xffff0000; // extract high halfword
728  s = b & 0xffff0000; // extract high halfword
729  u = min (r, s); // minimum of high halfwords
730  r = t | u; // combine halfword minimums
731 #endif /* __CUDA_ARCH__ >= 300 */
732  return r; // halfword-wise minimum of unsigned integers
733 }
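
// Usage sketch (illustrative; the helper name is a placeholder): clamping each
// unsigned halfword of x into [lo, hi] by composing vminu2 and vmaxu2.
static __device__ __forceinline__ unsigned int vclampu2_example(
    unsigned int x, unsigned int lo, unsigned int hi)
{
    return vmaxu2(vminu2(x, hi), lo); // per-halfword clamp
}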
734 
735 static __device__ __forceinline__ unsigned int vseteq2(unsigned int a, unsigned int b)
736 {
737  unsigned int r, c;
738 #if __CUDA_ARCH__ >= 300
739  c = 0;
740  asm ("vset2.u32.u32.eq %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
741 #else /* __CUDA_ARCH__ >= 300 */
742  // inspired by Alan Mycroft's null-byte detection algorithm:
743  // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
744  r = a ^ b; // 0x0000 if a == b
745  c = r | 0x80008000; // set msbs, to catch carry out
746  r = r ^ c; // extract msbs, msb = 1 if r < 0x8000
747  c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
748  c = r & ~c; // msb = 1, if r was 0x0000
749  r = c >> 15; // convert to bool
750 #endif /* __CUDA_ARCH__ >= 300 */
751  return r; // halfword-wise (un)signed eq comparison, bool result
752 }
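
// Usage sketch (illustrative; names are placeholders): the 0/1 bool results of
// the vset* functions can be accumulated directly, e.g. to count matching
// halfword lanes between two packed sequences.
static __device__ unsigned int count_equal_lanes2_example(const unsigned int *a,
                                                          const unsigned int *b,
                                                          int n_words)
{
    unsigned int n = 0;
    for (int i = 0; i < n_words; ++i) {
        unsigned int e = vseteq2(a[i], b[i]); // 1 in each matching lane
        n += (e >> 16) + (e & 0xffff);        // add both lanes' 0/1 results
    }
    return n;
}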
753 
754 static __device__ __forceinline__ unsigned int vsetges2(unsigned int a, unsigned int b)
755 {
756  unsigned int r;
757 #if __CUDA_ARCH__ >= 300
758  unsigned int c = 0;
759  asm ("vset2.s32.s32.ge %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
760 #else /* __CUDA_ARCH__ >= 300 */
761  asm ("{ \n\t"
762  ".reg .u32 a, b, r, s, t, u; \n\t"
763  "mov.b32 a,%1; \n\t"
764  "mov.b32 b,%2; \n\t"
765  "and.b32 s,a,0xffff0000;\n\t" // high word of a
766  "and.b32 t,b,0xffff0000;\n\t" // high word of b
767  "set.ge.s32.s32 u,s,t; \n\t" // compare two high words
768  "cvt.s32.s16 s,a; \n\t" // sign-extend low word of a
769  "cvt.s32.s16 t,b; \n\t" // sign-extend low word of b
770  "set.ge.s32.s32 s,s,t; \n\t" // compare two low words
771 #if __CUDA_ARCH__ >= 200
772  "prmt.b32 r,s,u,0x7632; \n\t" // combine low and high results
773  "and.b32 r,r,0x00010001;\n\t" // convert from mask to bool
774 #else /* __CUDA_ARCH__ >= 200 */
775  "and.b32 u,u,0x00010000;\n\t" // extract bool result of hi word
776  "and.b32 s,s,0x00000001;\n\t" // extract bool result of lo word
777  "or.b32 r,s,u; \n\t" // combine the two results
778 #endif /* __CUDA_ARCH__ >= 200 */
779  "mov.b32 %0,r; \n\t"
780  "}"
781  : "=r"(r) : "r"(a), "r"(b));
782 #endif /* __CUDA_ARCH__ >= 300 */
783  return r; // halfword-wise signed gt-eq comparison, bool result
784 }
785 
786 static __device__ __forceinline__ unsigned int vsetgeu2(unsigned int a, unsigned int b)
787 {
788  unsigned int r, c;
789 #if __CUDA_ARCH__ >= 300
790  c = 0;
791  asm ("vset2.u32.u32.ge %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
792 #else /* __CUDA_ARCH__ >= 300 */
793  asm ("not.b32 %0,%0;" : "+r"(b));
794  c = vavgu2 (a, b); // (a + ~b + 1) / 2 = (a - b) / 2
795  c = c & 0x80008000; // msb = carry-outs
796  r = c >> 15; // convert to bool
797 #endif /* __CUDA_ARCH__ >= 300 */
798  return r; // halfword-wise unsigned gt-eq comparison, bool result
799 }
800 
801 static __device__ __forceinline__ unsigned int vsetgts2(unsigned int a, unsigned int b)
802 {
803  unsigned int r;
804 #if __CUDA_ARCH__ >= 300
805  unsigned int c = 0;
806  asm ("vset2.s32.s32.gt %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
807 #else /* __CUDA_ARCH__ >= 300 */
808  asm ("{ \n\t"
809  ".reg .u32 a, b, r, s, t, u; \n\t"
810  "mov.b32 a,%1; \n\t"
811  "mov.b32 b,%2; \n\t"
812  "and.b32 s,a,0xffff0000;\n\t" // high word of a
813  "and.b32 t,b,0xffff0000;\n\t" // high word of b
814  "set.gt.s32.s32 u,s,t; \n\t" // compare two high words
815  "cvt.s32.s16 s,a; \n\t" // sign-extend low word of a
816  "cvt.s32.s16 t,b; \n\t" // sign-extend low word of b
817  "set.gt.s32.s32 s,s,t; \n\t" // compare two low words
818 #if __CUDA_ARCH__ >= 200
819  "prmt.b32 r,s,u,0x7632; \n\t" // combine low and high results
820  "and.b32 r,r,0x00010001;\n\t" // convert from mask to bool
821 #else /* __CUDA_ARCH__ >= 200 */
822  "and.b32 u,u,0x00010000;\n\t" // extract bool result of hi word
823  "and.b32 s,s,0x00000001;\n\t" // extract bool result of lo word
824  "or.b32 r,s,u; \n\t" // combine the two results
825 #endif /* __CUDA_ARCH__ >= 200 */
826  "mov.b32 %0,r; \n\t"
827  "}"
828  : "=r"(r) : "r"(a), "r"(b));
829 #endif /* __CUDA_ARCH__ >= 300 */
830  return r; // halfword-wise signed gt comparison with bool result
831 }
832 
833 static __device__ __forceinline__ unsigned int vsetgtu2(unsigned int a, unsigned int b)
834 {
835  unsigned int r, c;
836 #if __CUDA_ARCH__ >= 300
837  c = 0;
838  asm ("vset2.u32.u32.gt %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
839 #else /* __CUDA_ARCH__ >= 300 */
840  asm ("not.b32 %0,%0;" : "+r"(b));
841  c = vhaddu2 (a, b); // (a + ~b) / 2 = (a - b) / 2 [rounded down]
842  c = c & 0x80008000; // msbs = carry-outs
843  r = c >> 15; // convert to bool
844 #endif /* __CUDA_ARCH__ >= 300 */
845  return r; // halfword-wise unsigned gt comparison, bool result
846 }
847 
848 static __device__ __forceinline__ unsigned int vsetles2(unsigned int a, unsigned int b)
849 {
850  unsigned int r;
851 #if __CUDA_ARCH__ >= 300
852  unsigned int c = 0;
853  asm ("vset2.s32.s32.le %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
854 #else /* __CUDA_ARCH__ >= 300 */
855  asm ("{ \n\t"
856  ".reg .u32 a, b, r, s, t, u; \n\t"
857  "mov.b32 a,%1; \n\t"
858  "mov.b32 b,%2; \n\t"
859  "and.b32 s,a,0xffff0000;\n\t" // high word of a
860  "and.b32 t,b,0xffff0000;\n\t" // high word of b
861  "set.le.s32.s32 u,s,t; \n\t" // compare two high words
862  "cvt.s32.s16 s,a; \n\t" // sign-extend low word of a
863  "cvt.s32.s16 t,b; \n\t" // sign-extend low word of b
864  "set.le.s32.s32 s,s,t; \n\t" // compare two low words
865 #if __CUDA_ARCH__ >= 200
866  "prmt.b32 r,s,u,0x7632; \n\t" // combine low and high results
867  "and.b32 r,r,0x00010001;\n\t" // convert from mask to bool
868 #else /* __CUDA_ARCH__ >= 200 */
869  "and.b32 u,u,0x00010000;\n\t" // extract bool result of hi word
870  "and.b32 s,s,0x00000001;\n\t" // extract bool result of lo word
871  "or.b32 r,s,u; \n\t" // combine the two results
872 #endif /* __CUDA_ARCH__ >= 200 */
873  "mov.b32 %0,r; \n\t"
874  "}"
875  : "=r"(r) : "r"(a), "r"(b));
876 #endif /* __CUDA_ARCH__ >= 300 */
877  return r; // halfword-wise signed lt-eq comparison, bool result
878 }
879 
880 static __device__ __forceinline__ unsigned int vsetleu2(unsigned int a, unsigned int b)
881 {
882  unsigned int r, c;
883 #if __CUDA_ARCH__ >= 300
884  c = 0;
885  asm ("vset2.u32.u32.le %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
886 #else /* __CUDA_ARCH__ >= 300 */
887  asm ("not.b32 %0,%0;" : "+r"(a));
888  c = vavgu2 (a, b); // (b + ~a + 1) / 2 = (b - a) / 2
889  c = c & 0x80008000; // msb = carry-outs
890  r = c >> 15; // convert to bool
891 #endif /* __CUDA_ARCH__ >= 300 */
892  return r; // halfword-wise unsigned lt-eq comparison, bool result
893 }
894 
895 static __device__ __forceinline__ unsigned int vsetlts2(unsigned int a, unsigned int b)
896 {
897  unsigned int r;
898 #if __CUDA_ARCH__ >= 300
899  unsigned int c = 0;
900  asm ("vset2.s32.s32.lt %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
901 #else /* __CUDA_ARCH__ >= 300 */
902  asm ("{ \n\t"
903  ".reg .u32 a, b, r, s, t, u; \n\t"
904  "mov.b32 a,%1; \n\t"
905  "mov.b32 b,%2; \n\t"
906  "and.b32 s,a,0xffff0000;\n\t" // high word of a
907  "and.b32 t,b,0xffff0000;\n\t" // high word of b
908  "set.lt.s32.s32 u,s,t; \n\t" // compare two high words
909  "cvt.s32.s16 s,a; \n\t" // sign-extend low word of a
910  "cvt.s32.s16 t,b; \n\t" // sign-extend low word of b
911  "set.lt.s32.s32 s,s,t; \n\t" // compare two low words
912 #if __CUDA_ARCH__ >= 200
913  "prmt.b32 r,s,u,0x7632; \n\t" // combine low and high results
914  "and.b32 r,r,0x00010001;\n\t" // convert from mask to bool
915 #else /* __CUDA_ARCH__ >= 200 */
916  "and.b32 u,u,0x00010000;\n\t" // extract bool result of hi word
917  "and.b32 s,s,0x00000001;\n\t" // extract bool result of lo word
918  "or.b32 r,s,u; \n\t" // combine the two results
919 #endif /* __CUDA_ARCH__ >= 200 */
920  "mov.b32 %0,r; \n\t"
921  "}"
922  : "=r"(r) : "r"(a), "r"(b));
923 #endif /* __CUDA_ARCH__ >= 300 */
924  return r; // halfword-wise signed lt comparison with bool result
925 }
926 
927 static __device__ __forceinline__ unsigned int vsetltu2(unsigned int a, unsigned int b)
928 {
929  unsigned int r, c;
930 #if __CUDA_ARCH__ >= 300
931  c = 0;
932  asm ("vset2.u32.u32.lt %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
933 #else /* __CUDA_ARCH__ >= 300 */
934  asm ("not.b32 %0,%0;" : "+r"(a));
935  c = vhaddu2 (a, b); // (b + ~a) / 2 = (b - a) / 2 [rounded down]
936  c = c & 0x80008000; // msb = carry-outs
937  r = c >> 15; // convert to bool
938 #endif /* __CUDA_ARCH__ >= 300 */
939  return r; // halfword-wise unsigned lt comparison, bool result
940 }
941 
942 static __device__ __forceinline__ unsigned int vsetne2(unsigned int a, unsigned int b)
943 {
944  unsigned int r, c;
945 #if __CUDA_ARCH__ >= 300
946  c = 0;
947  asm ("vset2.u32.u32.ne %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
948 #else /* __CUDA_ARCH__ >= 300 */
949  // inspired by Alan Mycroft's null-byte detection algorithm:
950  // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
951  r = a ^ b; // 0x0000 if a == b
952  c = r | 0x80008000; // set msbs, to catch carry out
953  c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
954  c = r | c; // msb = 1, if r was not 0x0000
955  c = c & 0x80008000; // extract msbs
956  r = c >> 15; // convert to bool
957 #endif /* __CUDA_ARCH__ >= 300 */
958  return r; // halfword-wise (un)signed ne comparison, bool result
959 }
960 
961 static __device__ __forceinline__ unsigned int vsadu2(unsigned int a, unsigned int b)
962 {
963  unsigned int r, s;
964 #if __CUDA_ARCH__ >= 300
965  s = 0;
966  asm("vabsdiff2.u32.u32.u32.add %0,%1,%2,%3;":"=r"(r):"r"(a),"r"(b),"r"(s));
967 #else /* __CUDA_ARCH__ >= 300 */
968  unsigned int t, u, v;
969  s = a & 0x0000ffff; // extract low halfword
970  r = b & 0x0000ffff; // extract low halfword
971  u = max (r, s); // maximum of low halfwords
972  v = min (r, s); // minimum of low halfwords
973  s = a & 0xffff0000; // extract high halfword
974  r = b & 0xffff0000; // extract high halfword
975  t = max (r, s); // maximum of high halfwords
976  s = min (r, s); // minimum of high halfwords
977  u = u - v; // low halfword: |a - b| = max(a,b) - min(a,b);
978  t = t - s; // high halfword: |a - b| = max(a,b) - min(a,b);
979 #if __CUDA_ARCH__ < 350
980  asm ("shr.u32 %0,%0,16;" : "+r"(t));
981 #else /*__CUDA_ARCH__ < 350 */
982  asm ("shf.r.clamp.b32 %0,%0,0,16;" : "+r"(t));
983 #endif /*__CUDA_ARCH__ < 350 */
984  r = t + u; // sum absolute halfword differences
985 #endif /* __CUDA_ARCH__ >= 300 */
986  return r; // halfword-wise sum of abs differences of unsigned int
987 }
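
// Usage sketch (illustrative; names are placeholders): accumulating a sum of
// absolute differences over packed halfword data, the typical use of vsadu2.
static __device__ unsigned int sad2_example(const unsigned int *a,
                                            const unsigned int *b,
                                            int n_words)
{
    unsigned int sum = 0;
    for (int i = 0; i < n_words; ++i)
        sum += vsadu2(a[i], b[i]); // adds |a.lo - b.lo| + |a.hi - b.hi|
    return sum;
}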
988 
989 static __device__ __forceinline__ unsigned int vsub2(unsigned int a, unsigned int b)
990 {
991  unsigned int s, t;
992 #if __CUDA_ARCH__ >= 300
993  s = 0;
994  asm ("vsub2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(t) : "r"(a), "r"(b), "r"(s));
995 #else /* __CUDA_ARCH__ >= 300 */
996  s = a ^ b; // difference bits
997  t = a - b; // actual difference
998  s = s ^ t; // determine borrow-ins for each bit position
999  s = s & 0x00010000; // borrow to high word
1000  t = t + s; // compensate for borrow from low word
1001 #endif /* __CUDA_ARCH__ >= 300 */
1002  return t; // halfword-wise difference
1003 }
1004 
1005 static __device__ __forceinline__ unsigned int vsubss2 (unsigned int a, unsigned int b)
1006 {
1007  unsigned int r;
1008 #if __CUDA_ARCH__ >= 300
1009  unsigned int c = 0;
1010  asm ("vsub2.s32.s32.s32.sat %0,%1,%2,%3;" : "=r"(r):"r"(a),"r"(b),"r"(c));
1011 #else /* __CUDA_ARCH__ >= 300 */
1012  int ahi, alo, blo, bhi, rhi, rlo;
1013  ahi = (int)((a & 0xffff0000U));
1014  bhi = (int)((b & 0xffff0000U));
1015 #if __CUDA_ARCH__ < 200
1016  alo = (int)(a << 16);
1017  blo = (int)(b << 16);
1018 #elif __CUDA_ARCH__ < 350
1019  // work around (we would want left shifts at least for sm_2x)
1020  asm ("prmt.b32 %0,%1,0,0x1044;" : "=r"(alo) : "r"(a));
1021  asm ("prmt.b32 %0,%1,0,0x1044;" : "=r"(blo) : "r"(b));
1022 #else /* __CUDA_ARCH__ < 350 */
1023  asm ("shf.l.clamp.b32 %0,0,%1,16;" : "=r"(alo) : "r"(a));
1024  asm ("shf.l.clamp.b32 %0,0,%1,16;" : "=r"(blo) : "r"(b));
1025 #endif /* __CUDA_ARCH__ < 350 */
1026  asm ("sub.sat.s32 %0,%1,%2;" : "=r"(rlo) : "r"(alo), "r"(blo));
1027  asm ("sub.sat.s32 %0,%1,%2;" : "=r"(rhi) : "r"(ahi), "r"(bhi));
1028 #if __CUDA_ARCH__ < 200
1029  r = ((unsigned int)rhi & 0xffff0000U) | ((unsigned int)rlo >> 16);
1030 #else /* __CUDA_ARCH__ < 200 */
1031  asm ("prmt.b32 %0,%1,%2,0x7632;" : "=r"(r) : "r"(rlo), "r"(rhi));
1032 #endif /* __CUDA_ARCH__ < 200 */
1033 #endif /* __CUDA_ARCH__ >= 300 */
1034  return r; // halfword-wise difference with signed saturation
1035 }
1036 
1037 static __device__ __forceinline__ unsigned int vsubus2 (unsigned int a, unsigned int b)
1038 {
1039  unsigned int r;
1040 #if __CUDA_ARCH__ >= 300
1041  unsigned int c = 0;
1042  asm ("vsub2.u32.u32.u32.sat %0,%1,%2,%3;" : "=r"(r):"r"(a),"r"(b),"r"(c));
1043 #else /* __CUDA_ARCH__ >= 300 */
1044  int alo, blo, rlo, ahi, bhi, rhi;
1045  asm ("{ \n\t"
1046  "and.b32 %0, %4, 0xffff; \n\t"
1047  "and.b32 %1, %5, 0xffff; \n\t"
1048 #if __CUDA_ARCH__ < 350
1049  "shr.u32 %2, %4, 16; \n\t"
1050  "shr.u32 %3, %5, 16; \n\t"
1051 #else /* __CUDA_ARCH__ < 350 */
1052  "shf.r.clamp.b32 %2, %4, 0, 16;\n\t"
1053  "shf.r.clamp.b32 %3, %5, 0, 16;\n\t"
1054 #endif /* __CUDA_ARCH__ < 350 */
1055  "}"
1056  : "=r"(alo), "=r"(blo), "=r"(ahi), "=r"(bhi)
1057  : "r"(a), "r"(b));
1058  rlo = max ((int)(alo - blo), 0);
1059  rhi = max ((int)(ahi - bhi), 0);
1060  r = rhi * 65536 + rlo;
1061 #endif /* __CUDA_ARCH__ >= 300 */
1062  return r; // halfword-wise difference with unsigned saturation
1063 }
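
// Worked example (illustrative): vsubus2(0x00050002, 0x00070001) == 0x00000001,
// since 5 - 7 clamps to 0 in the high lane and 2 - 1 = 1 in the low lane.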
1064 
1065 static __device__ __forceinline__ unsigned int vneg2(unsigned int a)
1066 {
1067  return vsub2 (0, a);// halfword-wise negation with wrap-around
1068 }
1069 
1070 static __device__ __forceinline__ unsigned int vnegss2(unsigned int a)
1071 {
1072  return vsubss2(0,a);// halfword-wise negation with signed saturation
1073 }
1074 
1075 static __device__ __forceinline__ unsigned int vabsdiffs2(unsigned int a, unsigned int b)
1076 {
1077  unsigned int r, s;
1078 #if __CUDA_ARCH__ >= 300
1079  s = 0;
1080  asm ("vabsdiff2.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) :"r"(a),"r"(b),"r"(s));
1081 #else /* __CUDA_ARCH__ >= 300 */
1082  s = vcmpges2 (a, b);// mask = 0xffff if a >= b
1083  r = a ^ b; //
1084  s = (r & s) ^ b; // select a when a >= b, else select b => max(a,b)
1085  r = s ^ r; // select a when b >= a, else select b => min(a,b)
1086  r = vsub2 (s, r); // |a - b| = max(a,b) - min(a,b);
1087 #endif /* __CUDA_ARCH__ >= 300 */
1088  return r; // halfword-wise absolute difference of signed integers
1089 }
1090 
1091 static __device__ __forceinline__ unsigned int vsads2(unsigned int a, unsigned int b)
1092 {
1093  unsigned int r, s;
1094 #if __CUDA_ARCH__ >= 300
1095  s = 0;
1096  asm("vabsdiff2.s32.s32.s32.add %0,%1,%2,%3;":"=r"(r):"r"(a),"r"(b),"r"(s));
1097 #else /* __CUDA_ARCH__ >= 300 */
1098  s = vabsdiffs2 (a, b);
1099  r = (s >> 16) + (s & 0x0000ffff);
1100 #endif /* __CUDA_ARCH__ >= 300 */
1101  return r; // halfword-wise sum of abs. differences of signed ints
1102 }
1103 
1104 static __device__ __forceinline__ unsigned int vabs4(unsigned int a)
1105 {
1106  unsigned int r;
1107 #if __CUDA_ARCH__ >= 300
1108  unsigned int b = 0, c = 0;
1109  asm ("vabsdiff4.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) :"r"(a),"r"(b),"r"(c));
1110 #elif __CUDA_ARCH__ >= 200
1111  asm ("{ \n\t"
1112  ".reg .u32 a,m,r; \n\t"
1113  "mov.b32 a,%1; \n\t"
1114  "prmt.b32 m,a,0,0xba98; \n\t" // msb ? 0xff : 00
1115  "xor.b32 r,a,m; \n\t" // conditionally invert bits
1116  "and.b32 m,m,0x01010101;\n\t" // msb ? 0x1 : 0
1117  "add.u32 r,r,m; \n\t" // conditionally add 1
1118  "mov.b32 %0,r; \n\t"
1119  "}"
1120  : "=r"(r) : "r"(a));
1121 #else /* __CUDA_ARCH__ >= 200 */
1122  asm ("{ \n\t"
1123  ".reg .u32 a,m,r,s; \n\t"
1124  "mov.b32 a,%1; \n\t"
1125  "and.b32 m,a,0x80808080;\n\t" // extract msb
1126  "and.b32 r,a,0x7f7f7f7f;\n\t" // clear msb
1127  "shr.u32 s,m,7; \n\t" // build lsb mask
1128  "sub.u32 m,m,s; \n\t" // from msb
1129  "xor.b32 r,r,m; \n\t" // conditionally invert lsbs
1130  "add.u32 r,r,s; \n\t" // conditionally add 1
1131  "mov.b32 %0,r; \n\t"
1132  "}"
1133  : "=r"(r) : "r"(a));
1134 #endif /* __CUDA_ARCH__ >= 200 */
1135  return r; // byte-wise absolute value, with wrap-around
1136 }
1137 
1138 static __device__ __forceinline__ unsigned int vabsss4(unsigned int a)
1139 {
1140  unsigned int r;
1141 #if __CUDA_ARCH__ >= 300
1142  unsigned int b = 0, c = 0;
1143  asm("vabsdiff4.s32.s32.s32.sat %0,%1,%2,%3;":"=r"(r):"r"(a),"r"(b),"r"(c));
1144 #elif __CUDA_ARCH__ >= 200
1145  asm ("{ \n\t"
1146  ".reg .u32 a,m,r; \n\t"
1147  "mov.b32 a,%1; \n\t"
1148  "prmt.b32 m,a,0,0xba98; \n\t" // msb ? 0xff : 00
1149  "xor.b32 r,a,m; \n\t" // conditionally invert bits
1150  "and.b32 m,m,0x01010101;\n\t" // msb ? 0x1 : 0
1151  "add.u32 r,r,m; \n\t" // conditionally add 1
1152  "prmt.b32 m,r,0,0xba98; \n\t" // msb ? 0xff : 00
1153  "and.b32 m,m,0x01010101;\n\t" // msb ? 0x1 : 0
1154  "sub.u32 r,r,m; \n\t" // subtract 1 if result wrapped around
1155  "mov.b32 %0,r; \n\t"
1156  "}"
1157  : "=r"(r) : "r"(a));
1158 #else /* __CUDA_ARCH__ >= 200 */
1159  asm ("{ \n\t"
1160  ".reg .u32 a,m,r,s; \n\t"
1161  "mov.b32 a,%1; \n\t"
1162  "and.b32 m,a,0x80808080;\n\t" // extract msb
1163  "and.b32 r,a,0x7f7f7f7f;\n\t" // clear msb
1164  "shr.u32 s,m,7; \n\t" // build lsb mask
1165  "sub.u32 m,m,s; \n\t" // from msb
1166  "xor.b32 r,r,m; \n\t" // conditionally invert lsbs
1167  "add.u32 r,r,s; \n\t" // conditionally add 1
1168  "and.b32 m,r,0x80808080;\n\t" // extract msb (1 if wrap-around)
1169  "shr.u32 s,m,7; \n\t" // msb ? 1 : 0
1170  "sub.u32 r,r,s; \n\t" // subtract 1 if result wrapped around
1171  "mov.b32 %0,r; \n\t"
1172  "}"
1173  : "=r"(r) : "r"(a));
1174 #endif /* __CUDA_ARCH__ >= 200 */
1175  return r; // byte-wise absolute value with signed saturation
1176 }
1177 
1178 static __device__ __forceinline__ unsigned int vadd4(unsigned int a, unsigned int b)
1179 {
1180 #if __CUDA_ARCH__ >= 300
1181  unsigned int r, c = 0;
1182  asm ("vadd4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
1183 #else /* __CUDA_ARCH__ >= 300 */
1184  unsigned int r, s, t;
1185  s = a ^ b; // sum bits
1186  r = a & 0x7f7f7f7f; // clear msbs
1187  t = b & 0x7f7f7f7f; // clear msbs
1188  s = s & 0x80808080; // msb sum bits
1189  r = r + t; // add without msbs, record carry-out in msbs
1190  r = r ^ s; // sum of msb sum and carry-in bits, w/o carry-out
1191 #endif /* __CUDA_ARCH__ >= 300 */
1192  return r; // byte-wise sum, with wrap-around
1193 }
1194 
1195 static __device__ __forceinline__ unsigned int vaddss4 (unsigned int a, unsigned int b)
1196 {
1197 #if __CUDA_ARCH__ >= 300
1198  unsigned int r, c = 0;
1199  asm ("vadd4.s32.s32.s32.sat %0,%1,%2,%3;" : "=r"(r):"r"(a),"r"(b),"r"(c));
1200 #else /* __CUDA_ARCH__ >= 300 */
1201  /*
1202  For signed saturation, saturation is controlled by the overflow signal:
1203  ovfl = (carry-in to msb) XOR (carry-out from msb). Overflow can only
1204  occur when the msbs of both inputs are the same. The defined response to
1205  overflow is to deliver 0x7f when the addends are positive (bit 7 clear),
1206  and 0x80 when the addends are negative (bit 7 set). The truth table for
1207  the msb is
1208 
1209  a  b  cy_in  res  cy_out  ovfl
1210  --------------------------------
1211  0  0    0     0      0      0
1212  0  0    1     1      0      1
1213  0  1    0     1      0      0
1214  0  1    1     0      1      0
1215  1  0    0     1      0      0
1216  1  0    1     0      1      0
1217  1  1    0     0      1      1
1218  1  1    1     1      1      0
1219 
1220  The seven low-order bits can be handled by simple wrapping addition with
1221  the carry out from bit 6 recorded in the msb (thus corresponding to the
1222  cy_in in the truth table for the msb above). ovfl can be computed in many
1223  equivalent ways, here we use ovfl = (a ^ carry_in) & ~(a ^ b) since we
1224  already need to compute (a ^ b) for the msb sum bit computation. First we
1225  compute the normal, wrapped addition result. When overflow is detected,
1226  we mask off the msb of the result, then compute a mask covering the seven
1227  low order bits, which are all set to 1. This sets the byte to 0x7f as we
1228  previously cleared the msb. In the overflow case, the sign of the result
1229  matches the sign of either of the inputs, so we extract the sign of a and
1230  add it to the low order bits, which turns 0x7f into 0x80, the correct
1231  result for an overflowed negative result.
1232  */
1233  unsigned int r;
1234  asm ("{ \n\t"
1235  ".reg .u32 a,b,r,s,t,u; \n\t"
1236  "mov.b32 a, %1; \n\t"
1237  "mov.b32 b, %2; \n\t"
1238  "and.b32 r, a, 0x7f7f7f7f;\n\t" // clear msbs
1239  "and.b32 t, b, 0x7f7f7f7f;\n\t" // clear msbs
1240  "xor.b32 s, a, b; \n\t" // sum bits = (a ^ b)
1241  "add.u32 r, r, t; \n\t" // capture msb carry-in in bit 7
1242  "xor.b32 t, a, r; \n\t" // a ^ carry_in
1243  "not.b32 u, s; \n\t" // ~(a ^ b)
1244  "and.b32 t, t, u; \n\t" // ovfl = (a ^ carry_in) & ~(a ^ b)
1245  "and.b32 s, s, 0x80808080;\n\t" // msb sum bits
1246  "xor.b32 r, r, s; \n\t" // msb result = (a ^ b ^ carry_in)
1247 #if __CUDA_ARCH__ >= 200
1248  "prmt.b32 s,a,0,0xba98; \n\t" // sign(a) ? 0xff : 0
1249  "xor.b32 s,s,0x7f7f7f7f; \n\t" // sign(a) ? 0x80 : 0x7f
1250  "prmt.b32 t,t,0,0xba98; \n\t" // ovfl ? 0xff : 0
1251  "and.b32 s,s,t; \n\t" // ovfl ? (sign(a) ? 0x80:0x7f) : 0
1252  "not.b32 t,t; \n\t" // ~ovfl
1253  "and.b32 r,r,t; \n\t" // ovfl ? 0 : a + b
1254  "or.b32 r,r,s; \n\t" // ovfl ? (sign(a) ? 0x80:0x7f) : a+b
1255 #else /* __CUDA_ARCH__ >= 200 */
1256  "and.b32 t, t, 0x80808080;\n\t" // ovfl ? 0x80 : 0
1257  "shr.u32 s, t, 7; \n\t" // ovfl ? 1 : 0
1258  "not.b32 u, t; \n\t" // ovfl ? 0x7f : 0xff
1259  "and.b32 r, r, u; \n\t" // ovfl ? (a + b) & 0x7f : a + b
1260  "and.b32 u, a, t; \n\t" // ovfl ? a & 0x80 : 0
1261  "sub.u32 t, t, s; \n\t" // ovfl ? 0x7f : 0
1262  "shr.u32 u, u, 7; \n\t" // ovfl ? sign(a) : 0
1263  "or.b32 r, r, t; \n\t" // ovfl ? 0x7f : a + b
1264  "add.u32 r, r, u; \n\t" // ovfl ? 0x7f+sign(a) : a + b
1265 #endif /* __CUDA_ARCH__ >= 200 */
1266  "mov.b32 %0, r; \n\t"
1267  "}"
1268  : "=r"(r) : "r"(a), "r"(b));
1269 #endif /* __CUDA_ARCH__ >= 300 */
1270  return r; // byte-wise sum with signed saturation
1271 }
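
// Worked example (illustrative): vaddss4(0x7f017f80, 0x01ff0180) == 0x7f007f80.
// Per byte: 0x7f+0x01 overflows positive and clamps to 0x7f; 0x01+0xff is
// 1 + (-1) = 0; 0x7f+0x01 again clamps to 0x7f; 0x80+0x80 is (-128)+(-128),
// which overflows negative and clamps to 0x80.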
1272 
1273 static __device__ __forceinline__ unsigned int vaddus4 (unsigned int a, unsigned int b)
1274 {
1275 #if __CUDA_ARCH__ >= 300
1276  unsigned int r, c = 0;
1277  asm ("vadd4.u32.u32.u32.sat %0,%1,%2,%3;" : "=r"(r):"r"(a),"r"(b),"r"(c));
1278 #else /* __CUDA_ARCH__ >= 300 */
1279  // This code uses the same basic approach used for non-saturating addition.
1280  // The seven low-order bits in each byte are summed by regular addition,
1281  // with the carry-out from bit 6 (= carry-in for the msb) being recorded
1282  // in bit 7, while the msb is handled separately.
1283  //
1284  // The fact that this is a saturating addition simplifies the handling of
1285  // the msb. When carry-out from the msb occurs, the entire byte must be
1286  // written as 0xff, and the computed msb is overwritten in the process.
1287  // The corresponding entries in the truth table for the result msb thus
1288  // become "don't cares":
1289  //
1290  // a  b  cy-in  res  cy-out
1291  // ------------------------
1292  // 0  0    0     0      0
1293  // 0  0    1     1      0
1294  // 0  1    0     1      0
1295  // 0  1    1     X      1
1296  // 1  0    0     1      0
1297  // 1  0    1     X      1
1298  // 1  1    0     X      1
1299  // 1  1    1     X      1
1300  //
1301  // As is easily seen, the simplest implementation of the result msb bit is
1302  // simply (a | b | cy-in), with masking needed to isolate the msb. Note
1303  // that this computation also makes the msb handling redundant with the
1304  // clamping to 0xFF, because the msb is already set to 1 when saturation
1305  // occurs. This means we only need to apply saturation to the seven lsb
1306  // bits in each byte, by overwriting with 0x7F. Saturation is controlled
1307  // by carry-out from the msb, which can be represented by various Boolean
1308  // expressions. Since to compute (a | b | cy-in) we need to compute (a | b)
1309  // anyhow, the most efficient of these is cy-out = ((a & b) | cy-in) & (a | b).
1310  unsigned int r;
1311  asm ("{ \n\t"
1312  ".reg .u32 a,b,r,s,t,m; \n\t"
1313  "mov.b32 a, %1; \n\t"
1314  "mov.b32 b, %2; \n\t"
1315  "or.b32 m, a, b; \n\t" // (a | b)
1316  "and.b32 r, a, 0x7f7f7f7f;\n\t" // clear msbs
1317  "and.b32 t, b, 0x7f7f7f7f;\n\t" // clear msbs
1318  "and.b32 m, m, 0x80808080;\n\t" // (a | b), isolate msbs
1319  "add.u32 r, r, t; \n\t" // add w/o msbs, record msb-carry-ins
1320  "and.b32 t, a, b; \n\t" // (a & b)
1321  "or.b32 t, t, r; \n\t" // (a & b) | cy-in)
1322  "or.b32 r, r, m; \n\t" // msb = cy-in | (a | b)
1323  "and.b32 t, t, m; \n\t" // cy-out=((a&b)|cy-in)&(a|b),in msbs
1324 #if __CUDA_ARCH__ >= 200
1325  "prmt.b32 t, t, 0, 0xba98; \n\t" // cy-out ? 0xff : 0
1326 #else /* __CUDA_ARCH__ >= 200 */
1327  "shr.u32 s, t, 7; \n\t" // cy-out ? 1 : 0
1328  "sub.u32 t, t, s; \n\t" // lsb-overwrite: cy-out ? 0x7F : 0
1329 #endif /* __CUDA_ARCH__ >= 200 */
1330  "or.b32 r, r, t; \n\t" // conditionally overwrite lsbs
1331  "mov.b32 %0, r; \n\t"
1332  "}"
1333  : "=r"(r) : "r"(a), "r"(b));
1334 #endif /* __CUDA_ARCH__ >= 300 */
1335  return r; // byte-wise sum with unsigned saturation
1336 }
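
/*
 Illustrative sketch, not part of the original header: the per-byte
 semantics reproduced by the vaddus4 emulation above; the name
 vaddus4_ref_byte is an assumption made for illustration only.
*/
static __device__ __forceinline__ unsigned int vaddus4_ref_byte(unsigned int a, unsigned int b)
{
    unsigned int s = (a & 0xff) + (b & 0xff); // exact 9-bit sum of one byte lane
    return (s > 0xff) ? 0xffu : s;            // clamp to 0xff when the lane carries out
}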
1337 
1338 static __device__ __forceinline__ unsigned int vavgs4(unsigned int a, unsigned int b)
1339 {
1340  unsigned int r;
1341 #if __CUDA_ARCH__ >= 300
1342  unsigned int c = 0;
1343  asm ("vavrg4.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
1344 #else /* __CUDA_ARCH__ >= 300 */
1345  // avgs(a,b) = ((a + b) < 0) ? ((a + b) >> 1) : ((a + b + 1) >> 1). The
1346  // two expressions can be re-written as follows to avoid needing additional
1347  // intermediate bits: ((a + b) >> 1) = (a >> 1) + (b >> 1) + ((a & b) & 1),
1348  // ((a + b + 1) >> 1) = (a >> 1) + (b >> 1) + ((a | b) & 1). The difference
1349  // between the two is ((a ^ b) & 1). Note that if (a + b) < 0, then also
1350  // ((a + b) >> 1) < 0, since right shift rounds to negative infinity. This
1351  // means we can compute ((a + b) >> 1) then conditionally add ((a ^ b) & 1)
1352  // depending on the sign bit of the shifted sum. By handling the msb sum
1353  // bit of the result separately, we avoid carry-out during summation and
1354  // also can use (potentially faster) logical right shifts.
1355  asm ("{ \n\t"
1356  ".reg .u32 a,b,c,r,s,t,u,v;\n\t"
1357  "mov.b32 a,%1; \n\t"
1358  "mov.b32 b,%2; \n\t"
1359  "and.b32 u,a,0xfefefefe;\n\t" // prevent shift crossing chunk boundary
1360  "and.b32 v,b,0xfefefefe;\n\t" // prevent shift crossing chunk boundary
1361  "xor.b32 s,a,b; \n\t" // a ^ b
1362  "and.b32 t,a,b; \n\t" // a & b
1363  "shr.u32 u,u,1; \n\t" // a >> 1
1364  "shr.u32 v,v,1; \n\t" // b >> 1
1365  "and.b32 c,s,0x01010101;\n\t" // (a ^ b) & 1
1366  "and.b32 s,s,0x80808080;\n\t" // extract msb (a ^ b)
1367  "and.b32 t,t,0x01010101;\n\t" // (a & b) & 1
1368  "add.u32 r,u,v; \n\t" // (a>>1)+(b>>1)
1369  "add.u32 r,r,t; \n\t" // (a>>1)+(b>>1)+(a&b&1); rec. msb cy-in
1370  "xor.b32 r,r,s; \n\t" // compute msb sum bit: a ^ b ^ cy-in
1371  "shr.u32 t,r,7; \n\t" // sign ((a + b) >> 1)
1372  "not.b32 t,t; \n\t" // ~sign ((a + b) >> 1)
1373  "and.b32 t,t,c; \n\t" // ((a ^ b) & 1) & ~sign ((a + b) >> 1)
1374  "add.u32 r,r,t; \n\t" // conditionally add ((a ^ b) & 1)
1375  "mov.b32 %0,r; \n\t"
1376  "}"
1377  : "=r"(r) : "r"(a), "r"(b));
1378 #endif /* __CUDA_ARCH__ >= 300 */
1379  return r; // byte-wise average of signed integers
1380 }
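
/*
 Illustrative sketch, not part of the original header: the per-byte signed
 rounded average that vavgs4 computes in each lane, following the
 definition quoted in the comment above; vavgs4_ref_byte is a hypothetical
 name used for illustration only.
*/
static __device__ __forceinline__ unsigned int vavgs4_ref_byte(unsigned int a, unsigned int b)
{
    int sa = (int)(signed char)(a & 0xff); // sign-extend the low byte of a
    int sb = (int)(signed char)(b & 0xff); // sign-extend the low byte of b
    int s  = sa + sb;                      // exact sum
    s = (s + (s >= 0)) >> 1;               // add 1 before halving only when sum >= 0
    return (unsigned int)s & 0xff;
}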
1381 
1382 static __device__ __forceinline__ unsigned int vavgu4(unsigned int a, unsigned int b)
1383 {
1384  unsigned int r, c;
1385 #if __CUDA_ARCH__ >= 300
1386  c = 0;
1387  asm ("vavrg4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
1388 #else /* __CUDA_ARCH__ >= 300 */
1389  // HAKMEM #23: a + b = 2 * (a | b) - (a ^ b) ==>
1390  // (a + b + 1) / 2 = (a | b) - ((a ^ b) >> 1)
1391  c = a ^ b;
1392  r = a | b;
1393  c = c & 0xfefefefe; // ensure following shift doesn't cross byte boundaries
1394  c = c >> 1;
1395  r = r - c;
1396 #endif /* __CUDA_ARCH__ >= 300 */
1397  return r; // byte-wise average of unsigned integers
1398 }
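
/*
 Illustrative check, not part of the original header: the HAKMEM #23
 identity used above on a single byte lane, e.g. a = 0xff, b = 0x01:
 (a | b) - ((a ^ b) >> 1) = 0xff - 0x7f = 0x80 = (0xff + 0x01 + 1) / 2.
 The name vavgu4_ref_byte is a hypothetical one used for illustration.
*/
static __device__ __forceinline__ unsigned int vavgu4_ref_byte(unsigned int a, unsigned int b)
{
    return ((a & 0xff) + (b & 0xff) + 1) >> 1; // rounded-up average of one byte lane
}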
1399 
1400 static __device__ __forceinline__ unsigned int vhaddu4(unsigned int a, unsigned int b)
1401 {
1402  // HAKMEM #23: a + b = 2 * (a & b) + (a ^ b) ==>
1403  // (a + b) / 2 = (a & b) + ((a ^ b) >> 1)
1404  unsigned int r, s;
1405  s = a ^ b;
1406  r = a & b;
1407  s = s & 0xfefefefe; // ensure following shift doesn't cross byte boundaries
1408  s = s >> 1;
1409  s = r + s;
1410  return s; // byte-wise average of unsigned integers, rounded down
1411 }
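
/*
 Illustrative check, not part of the original header: the rounded-down
 variant of HAKMEM #23 used above on one byte lane, e.g. a = 0xff,
 b = 0x01: (a & b) + ((a ^ b) >> 1) = 0x01 + 0x7f = 0x80 = (0xff + 0x01) / 2.
 The name vhaddu4_ref_byte is a hypothetical one used for illustration.
*/
static __device__ __forceinline__ unsigned int vhaddu4_ref_byte(unsigned int a, unsigned int b)
{
    return ((a & 0xff) + (b & 0xff)) >> 1; // truncated average of one byte lane
}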
1412 
1413 static __device__ __forceinline__ unsigned int vcmpeq4(unsigned int a, unsigned int b)
1414 {
1415  unsigned int c, r;
1416 #if __CUDA_ARCH__ >= 300
1417  r = 0;
1418  asm ("vset4.u32.u32.eq %0,%1,%2,%3;" : "=r"(c) : "r"(a), "r"(b), "r"(r));
1419  r = c << 8; // convert bool
1420  r = r - c; // to mask
1421 #else /* __CUDA_ARCH__ >= 300 */
1422  // inspired by Alan Mycroft's null-byte detection algorithm:
1423  // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
1424  r = a ^ b; // 0x00 if a == b
1425  c = r | 0x80808080; // set msbs, to catch carry out
1426  r = r ^ c; // extract msbs, msb = 1 if r < 0x80
1427  c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
1428  c = r & ~c; // msb = 1, if r was 0x00
1429 #if __CUDA_ARCH__ >= 200
1430  asm ("prmt.b32 %0,%1,0,0xba98;" : "=r"(r) : "r"(c));// convert msbs to mask
1431 #else /* __CUDA_ARCH__ >= 200 */
1432  asm ("shr.u32 %0,%1,7;" : "=r"(r) : "r"(c)); // convert
1433  asm ("sub.u32 %0,%1,%0;" : "+r"(r) : "r"(c)); // msbs to
1434  asm ("or.b32 %0,%1,%0;" : "+r"(r) : "r"(c)); // mask
1435 #endif /* __CUDA_ARCH__ >= 200 */
1436 #endif /* __CUDA_ARCH__ >= 300 */
1437  return r; // byte-wise (un)signed eq comparison with mask result
1438 }
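
/*
 Illustrative sketch, not part of the original header: the null-byte
 detection used by the emulation above, restated without inline assembly.
 The bool-to-mask step here uses the multiply-by-255 identity
 (x << 8) - x instead of the shr/sub/or sequence; vcmpeq4_sketch is a
 hypothetical name used for illustration only.
*/
static __device__ __forceinline__ unsigned int vcmpeq4_sketch(unsigned int a, unsigned int b)
{
    unsigned int r = a ^ b;          // 0x00 in every byte where a == b
    unsigned int c = r | 0x80808080; // set msbs so per-byte borrows stay local
    r = r ^ c;                       // msb = 1 where the byte of a ^ b was < 0x80
    c = c - 0x01010101;              // msb = 0 only where that byte was 0x00 or 0x80
    c = r & ~c;                      // msb = 1 exactly where the byte was 0x00
    c = c >> 7;                      // move each flag into bit 0 of its byte
    return (c << 8) - c;             // 0x01 -> 0xff per byte: mask result
}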
1439 
1440 static __device__ __forceinline__ unsigned int vcmpges4(unsigned int a, unsigned int b)
1441 {
1442  unsigned int r;
1443 #if __CUDA_ARCH__ >= 300
1444  unsigned int c = 0;
1445  asm ("vset4.s32.s32.ge %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
1446  c = r << 8; // convert bool
1447  r = c - r; // to mask
1448 #else /* __CUDA_ARCH__ >= 300 */
1449  asm ("{ \n\t"
1450  ".reg .u32 a, b, r, s, t, u;\n\t"
1451  "mov.b32 a,%1; \n\t"
1452  "mov.b32 b,%2; \n\t"
1453  "xor.b32 s,a,b; \n\t" // a ^ b
1454  "or.b32 r,a,0x80808080;\n\t" // set msbs
1455  "and.b32 t,b,0x7f7f7f7f;\n\t" // clear msbs
1456  "sub.u32 r,r,t; \n\t" // subtract lsbs, msb: ~borrow-in
1457  "xor.b32 t,r,a; \n\t" // msb: ~borrow-in ^ a
1458  "xor.b32 r,r,s; \n\t" // msb: ~sign(res) = a^b^~borrow-in
1459  "and.b32 t,t,s; \n\t" // msb: ovfl= (~bw-in ^ a) & (a ^ b)
1460  "xor.b32 t,t,r; \n\t" // msb: ge = ovfl != ~sign(res)
1461 #if __CUDA_ARCH__ >= 200
1462  "prmt.b32 r,t,0,0xba98; \n\t" // build mask from msbs
1463 #else /* __CUDA_ARCH__ >= 200 */
1464  "and.b32 t,t,0x80808080;\n\t" // isolate msbs = ovfl
1465  "shr.u32 r,t,7; \n\t" // build mask
1466  "sub.u32 r,t,r; \n\t" // from
1467  "or.b32 r,r,t; \n\t" // msbs
1468 #endif /* __CUDA_ARCH__ >= 200 */
1469  "mov.b32 %0,r; \n\t"
1470  "}"
1471  : "=r"(r) : "r"(a), "r"(b));
1472 #endif /* __CUDA_ARCH__ >= 300 */
1473  return r; // byte-wise signed gt-eq comparison with mask result
1474 }
1475 
1476 static __device__ __forceinline__ unsigned int vcmpgeu4(unsigned int a, unsigned int b)
1477 {
1478  unsigned int r, c;
1479 #if __CUDA_ARCH__ >= 300
1480  c = 0;
1481  asm ("vset4.u32.u32.ge %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
1482  c = r << 8; // convert bool
1483  r = c - r; // to mask
1484 #else /* __CUDA_ARCH__ >= 300 */
1485  asm ("not.b32 %0,%0;" : "+r"(b));
1486  c = vavgu4 (a, b); // (a + ~b + 1) / 2 = (a - b) / 2
1487 #if __CUDA_ARCH__ >= 200
1488  asm ("prmt.b32 %0,%1,0,0xba98;" : "=r"(r) : "r"(c));// build mask from msbs
1489 #else /* __CUDA_ARCH__ >= 200 */
1490  asm ("and.b32 %0,%0,0x80808080;" : "+r"(c)); // msb = carry-outs
1491  asm ("shr.u32 %0,%1,7;" : "=r"(r) : "r"(c)); // build mask
1492  asm ("sub.u32 %0,%1,%0;" : "+r"(r) : "r"(c)); // from
1493  asm ("or.b32 %0,%1,%0;" : "+r"(r) : "r"(c)); // msbs
1494 #endif /* __CUDA_ARCH__ >= 200 */
1495 #endif /* __CUDA_ARCH__ >= 300 */
1496  return r; // byte-wise unsigned gt-eq comparison with mask result
1497 }
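
/*
 Illustrative check, not part of the original header: the borrow argument
 used above on one byte lane. Since a + ~b + 1 = a - b + 256, the halved
 sum has its msb set exactly when a >= b, e.g. a = 5, b = 3:
 (5 + 0xfc + 1) / 2 = 0x81 (msb set); a = 3, b = 5: (3 + 0xfa + 1) / 2 =
 0x7f (msb clear). The name vcmpgeu4_check_byte is a hypothetical one.
*/
static __device__ __forceinline__ unsigned int vcmpgeu4_check_byte(unsigned int a, unsigned int b)
{
    unsigned int h = ((a & 0xff) + ((b & 0xff) ^ 0xff) + 1) >> 1; // (a + ~b + 1) / 2
    return (h & 0x80) ? 0xffu : 0x00u;                            // mask for one byte lane
}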
1498 
1499 static __device__ __forceinline__ unsigned int vcmpgts4(unsigned int a, unsigned int b)
1500 {
1501  unsigned int r;
1502 #if __CUDA_ARCH__ >= 300
1503  unsigned int c = 0;
1504  asm ("vset4.s32.s32.gt %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
1505  c = r << 8; // convert bool
1506  r = c - r; // to mask
1507 #else /* __CUDA_ARCH__ >= 300 */
1508  /* a <= b <===> a + ~b < 0 */
1509  asm ("{ \n\t"
1510  ".reg .u32 a,b,r,s,t,u; \n\t"
1511  "mov.b32 a,%1; \n\t"
1512  "mov.b32 b,%2; \n\t"
1513  "not.b32 b,b; \n\t"
1514  "and.b32 r,a,0x7f7f7f7f;\n\t" // clear msbs
1515  "and.b32 t,b,0x7f7f7f7f;\n\t" // clear msbs
1516  "xor.b32 s,a,b; \n\t" // sum bits = (a ^ b)
1517  "add.u32 r,r,t; \n\t" // capture msb carry-in in bit 7
1518  "xor.b32 t,a,r; \n\t" // a ^ carry_in
1519  "not.b32 u,s; \n\t" // ~(a ^ b)
1520  "and.b32 t,t,u; \n\t" // msb: ovfl = (a ^ carry_in) & ~(a^b)
1521  "xor.b32 r,r,u; \n\t" // msb: ~result = (~(a ^ b) ^ carry_in)
1522  "xor.b32 t,t,r; \n\t" // msb: gt = ovfl != sign(~res)
1523 #if __CUDA_ARCH__ >= 200
1524  "prmt.b32 r,t,0,0xba98; \n\t" // build mask from msbs
1525 #else /* __CUDA_ARCH__ >= 200 */
1526  "and.b32 t,t,0x80808080;\n\t" // isolate msbs
1527  "shr.u32 r,t,7; \n\t" // build mask
1528  "sub.u32 r,t,r; \n\t" // from
1529  "or.b32 r,r,t; \n\t" // msbs
1530 #endif /* __CUDA_ARCH__ >= 200 */
1531  "mov.b32 %0,r; \n\t"
1532  "}"
1533  : "=r"(r) : "r"(a), "r"(b));
1534 #endif /* __CUDA_ARCH__ >= 300 */
1535  return r; // byte-wise signed gt comparison with mask result
1536 }
1537 
1538 static __device__ __forceinline__ unsigned int vcmpgtu4(unsigned int a, unsigned int b)
1539 {
1540  unsigned int r, c;
1541 #if __CUDA_ARCH__ >= 300
1542  c = 0;
1543  asm ("vset4.u32.u32.gt %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
1544  c = r << 8; // convert bool
1545  r = c - r; // to mask
1546 #else /* __CUDA_ARCH__ >= 300 */
1547  asm ("not.b32 %0,%0;" : "+r"(b));
1548  c = vhaddu4 (a, b); // (a + ~b) / 2 = (a - b) / 2 [rounded down]
1549 #if __CUDA_ARCH__ >= 200
1550  asm ("prmt.b32 %0,%1,0,0xba98;" : "=r"(r) : "r"(c));// build mask from msbs
1551 #else /* __CUDA_ARCH__ >= 200 */
1552  asm ("and.b32 %0,%0,0x80808080;" : "+r"(c)); // msb = carry-outs
1553  asm ("shr.u32 %0,%1,7;" : "=r"(r) : "r"(c)); // build mask
1554  asm ("sub.u32 %0,%1,%0;" : "+r"(r) : "r"(c)); // from
1555  asm ("or.b32 %0,%1,%0;" : "+r"(r) : "r"(c)); // msbs
1556 #endif /* __CUDA_ARCH__ >= 200 */
1557 #endif /* __CUDA_ARCH__ >= 300 */
1558  return r; // byte-wise unsigned gt comparison with mask result
1559 }
1560 
1561 static __device__ __forceinline__ unsigned int vcmples4(unsigned int a, unsigned int b)
1562 {
1563  unsigned int r;
1564 #if __CUDA_ARCH__ >= 300
1565  unsigned int c = 0;
1566  asm ("vset4.s32.s32.le %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
1567  c = r << 8; // convert bool
1568  r = c - r; // to mask
1569 #else /* __CUDA_ARCH__ >= 300 */
1570  /* a <= b <===> a + ~b < 0 */
1571  asm ("{ \n\t"
1572  ".reg .u32 a,b,r,s,t,u; \n\t"
1573  "mov.b32 a,%1; \n\t"
1574  "mov.b32 b,%2; \n\t"
1575  "not.b32 u,b; \n\t" // ~b
1576  "and.b32 r,a,0x7f7f7f7f;\n\t" // clear msbs
1577  "and.b32 t,u,0x7f7f7f7f;\n\t" // clear msbs
1578  "xor.b32 u,a,b; \n\t" // sum bits = (a ^ b)
1579  "add.u32 r,r,t; \n\t" // capture msb carry-in in bit 7
1580  "xor.b32 t,a,r; \n\t" // a ^ carry_in
1581  "not.b32 s,u; \n\t" // ~(a ^ b)
1582  "and.b32 t,t,u; \n\t" // msb: ovfl = (a ^ carry_in) & (a ^ b)
1583  "xor.b32 r,r,s; \n\t" // msb: result = (a ^ ~b ^ carry_in)
1584  "xor.b32 t,t,r; \n\t" // msb: le = ovfl != sign(res)
1585 #if __CUDA_ARCH__ >= 200
1586  "prmt.b32 r,t,0,0xba98; \n\t" // build mask from msbs
1587 #else /* __CUDA_ARCH__ >= 200 */
1588  "and.b32 t,t,0x80808080;\n\t" // isolate msbs
1589  "shr.u32 r,t,7; \n\t" // build mask
1590  "sub.u32 r,t,r; \n\t" // from
1591  "or.b32 r,r,t; \n\t" // msbs
1592 #endif /* __CUDA_ARCH__ >= 200 */
1593  "mov.b32 %0,r; \n\t"
1594  "}"
1595  : "=r"(r) : "r"(a), "r"(b));
1596 #endif /* __CUDA_ARCH__ >= 300 */
1597  return r; // byte-wise signed lt-eq comparison with mask result
1598 }
1599 
1600 static __device__ __forceinline__ unsigned int vcmpleu4(unsigned int a, unsigned int b)
1601 {
1602  unsigned int r, c;
1603 #if __CUDA_ARCH__ >= 300
1604  c = 0;
1605  asm ("vset4.u32.u32.le %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
1606  c = r << 8; // convert bool
1607  r = c - r; // to mask
1608 #else /* __CUDA_ARCH__ >= 300 */
1609  asm ("not.b32 %0,%0;" : "+r"(a));
1610  c = vavgu4 (a, b); // (b + ~a + 1) / 2 = (b - a) / 2
1611 #if __CUDA_ARCH__ >= 200
1612  asm ("prmt.b32 %0,%1,0,0xba98;" : "=r"(r) : "r"(c));// build mask from msbs
1613 #else /* __CUDA_ARCH__ >= 200 */
1614  asm ("and.b32 %0,%0,0x80808080;" : "+r"(c)); // msb = carry-outs
1615  asm ("shr.u32 %0,%1,7;" : "=r"(r) : "r"(c)); // build mask
1616  asm ("sub.u32 %0,%1,%0;" : "+r"(r) : "r"(c)); // from
1617  asm ("or.b32 %0,%1,%0;" : "+r"(r) : "r"(c)); // msbs
1618 #endif /* __CUDA_ARCH__ >= 200 */
1619 #endif /* __CUDA_ARCH__ >= 300 */
1620  return r; // byte-wise unsigned lt-eq comparison with mask result
1621 }
1622 
1623 static __device__ __forceinline__ unsigned int vcmplts4(unsigned int a, unsigned int b)
1624 {
1625  unsigned int r;
1626 #if __CUDA_ARCH__ >= 300
1627  unsigned int c = 0;
1628  asm ("vset4.s32.s32.lt %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
1629  c = r << 8; // convert bool
1630  r = c - r; // to mask
1631 #else /* __CUDA_ARCH__ >= 300 */
1632  asm ("{ \n\t"
1633  ".reg .u32 a, b, r, s, t, u;\n\t"
1634  "mov.b32 a,%1; \n\t"
1635  "mov.b32 b,%2; \n\t"
1636  "not.b32 u,b; \n\t" // ~b
1637  "xor.b32 s,u,a; \n\t" // a ^ ~b
1638  "or.b32 r,a,0x80808080;\n\t" // set msbs
1639  "and.b32 t,b,0x7f7f7f7f;\n\t" // clear msbs
1640  "sub.u32 r,r,t; \n\t" // subtract lsbs, msb: ~borrow-in
1641  "xor.b32 t,r,a; \n\t" // msb: ~borrow-in ^ a
1642  "not.b32 u,s; \n\t" // msb: ~(a^~b)
1643  "xor.b32 r,r,s; \n\t" // msb: res = a ^ ~b ^ ~borrow-in
1644  "and.b32 t,t,u; \n\t" // msb: ovfl= (~bw-in ^ a) & ~(a^~b)
1645  "xor.b32 t,t,r; \n\t" // msb: lt = ovfl != sign(res)
1646 #if __CUDA_ARCH__ >= 200
1647  "prmt.b32 r,t,0,0xba98; \n\t" // build mask from msbs
1648 #else /* __CUDA_ARCH__ >= 200 */
1649  "and.b32 t,t,0x80808080;\n\t" // isolate msbs
1650  "shr.u32 r,t,7; \n\t" // build mask
1651  "sub.u32 r,t,r; \n\t" // from
1652  "or.b32 r,r,t; \n\t" // msbs
1653 #endif /* __CUDA_ARCH__ >= 200 */
1654  "mov.b32 %0,r; \n\t"
1655  "}"
1656  : "=r"(r) : "r"(a), "r"(b));
1657 #endif /* __CUDA_ARCH__ >= 300 */
1658  return r; // byte-wise signed lt comparison with mask result
1659 }
1660 
1661 static __device__ __forceinline__ unsigned int vcmpltu4(unsigned int a, unsigned int b)
1662 {
1663  unsigned int r, c;
1664 #if __CUDA_ARCH__ >= 300
1665  c = 0;
1666  asm ("vset4.u32.u32.lt %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
1667  c = r << 8; // convert bool
1668  r = c - r; // to mask
1669 #else /* __CUDA_ARCH__ >= 300 */
1670  asm ("not.b32 %0,%0;" : "+r"(a));
1671  c = vhaddu4 (a, b); // (b + ~a) / 2 = (b - a) / 2 [rounded down]
1672 #if __CUDA_ARCH__ >= 200
1673  asm ("prmt.b32 %0,%1,0,0xba98;" : "=r"(r) : "r"(c));// build mask from msbs
1674 #else /* __CUDA_ARCH__ >= 200 */
1675  asm ("and.b32 %0,%0,0x80808080;" : "+r"(c)); // msb = carry-outs
1676  asm ("shr.u32 %0,%1,7;" : "=r"(r) : "r"(c)); // build mask
1677  asm ("sub.u32 %0,%1,%0;" : "+r"(r) : "r"(c)); // from
1678  asm ("or.b32 %0,%1,%0;" : "+r"(r) : "r"(c)); // msbs
1679 #endif /* __CUDA_ARCH__ >= 200 */
1680 #endif /* __CUDA_ARCH__ >= 300 */
1681  return r; // byte-wise unsigned lt comparison with mask result
1682 }
1683 
1684 static __device__ __forceinline__ unsigned int vcmpne4(unsigned int a, unsigned int b)
1685 {
1686  unsigned int r, c;
1687 #if __CUDA_ARCH__ >= 300
1688  c = 0;
1689  asm ("vset4.u32.u32.ne %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
1690  c = r << 8; // convert bool
1691  r = c - r; // to mask
1692 #else /* __CUDA_ARCH__ >= 300 */
1693  // inspired by Alan Mycroft's null-byte detection algorithm:
1694  // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
1695  r = a ^ b; // 0x00 if a == b
1696  c = r | 0x80808080; // set msbs, to catch carry out
1697  c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
1698  c = r | c; // msb = 1, if r was not 0x00
1699 #if __CUDA_ARCH__ >= 200
1700  asm ("prmt.b32 %0,%1,0,0xba98;" : "=r"(r) : "r"(c));// build mask from msbs
1701 #else /* __CUDA_ARCH__ >= 200 */
1702  asm ("and.b32 %0,%0,0x80808080;" : "+r"(c)); // extract msbs
1703  asm ("shr.u32 %0,%1,7;" : "=r"(r) : "r"(c)); // build mask
1704  asm ("sub.u32 %0,%1,%0;" : "+r"(r) : "r"(c)); // from
1705  asm ("or.b32 %0,%1,%0;" : "+r"(r) : "r"(c)); // msbs
1706 #endif /* __CUDA_ARCH__ >= 200 */
1707 #endif /* __CUDA_ARCH__ >= 300 */
1708  return r; // byte-wise (un)signed ne comparison with mask result
1709 }
1710 
1711 static __device__ __forceinline__ unsigned int vabsdiffu4(unsigned int a, unsigned int b)
1712 {
1713  unsigned int r, s;
1714 #if __CUDA_ARCH__ >= 300
1715  s = 0;
1716  asm ("vabsdiff4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) :"r"(a),"r"(b),"r"(s));
1717 #else /* __CUDA_ARCH__ >= 300 */
1718  s = vcmpgeu4 (a, b);// mask = 0xff if a >= b
1719  r = a ^ b; //
1720  s = (r & s) ^ b; // select a when a >= b, else select b => max(a,b)
1721  r = s ^ r; // select a when b >= a, else select b => min(a,b)
1722  r = s - r; // |a - b| = max(a,b) - min(a,b);
1723 #endif /* __CUDA_ARCH__ >= 300 */
1724  return r; // byte-wise absolute difference of unsigned integers
1725 }
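
/*
 Illustrative sketch, not part of the original header: the mask-based
 selection used above, restated in plain C on the full word. With
 m = 0xff in each byte where a >= b, ((a ^ b) & m) ^ b picks a in those
 lanes and b elsewhere; vabsdiffu4_sketch is a hypothetical name.
*/
static __device__ __forceinline__ unsigned int vabsdiffu4_sketch(unsigned int a, unsigned int b)
{
    unsigned int m  = vcmpgeu4 (a, b); // 0xff in each byte where a >= b
    unsigned int x  = a ^ b;           // differing bits of a and b
    unsigned int mx = (x & m) ^ b;     // per-byte max: a where a >= b, else b
    unsigned int mn = mx ^ x;          // per-byte min: the operand not picked above
    return mx - mn;                    // no lane borrows, since max >= min per byte
}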
1726 
1727 static __device__ __forceinline__ unsigned int vmaxs4(unsigned int a, unsigned int b)
1728 {
1729  unsigned int r, s;
1730 #if __CUDA_ARCH__ >= 300
1731  s = 0;
1732  asm ("vmax4.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(s));
1733 #else /* __CUDA_ARCH__ >= 300 */
1734  s = vcmpges4 (a, b);// mask = 0xff if a >= b
1735  r = a & s; // select a when b >= a
1736  s = b & ~s; // select b when b < a
1737  r = r | s; // combine byte selections
1738 #endif /* __CUDA_ARCH__ >= 300 */
1739  return r; // byte-wise maximum of signed integers
1740 }
1741 
1742 static __device__ __forceinline__ unsigned int vmaxu4(unsigned int a, unsigned int b)
1743 {
1744  unsigned int r, s;
1745 #if __CUDA_ARCH__ >= 300
1746  s = 0;
1747  asm ("vmax4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(s));
1748 #else /* __CUDA_ARCH__ >= 300 */
1749  s = vcmpgeu4 (a, b);// mask = 0xff if a >= b
1750  r = a & s; // select a when b >= a
1751  s = b & ~s; // select b when b < a
1752  r = r | s; // combine byte selections
1753 #endif /* __CUDA_ARCH__ >= 300 */
1754  return r; // byte-wise maximum of unsigned integers
1755 }
1756 
1757 static __device__ __forceinline__ unsigned int vmins4(unsigned int a, unsigned int b)
1758 {
1759  unsigned int r, s;
1760 #if __CUDA_ARCH__ >= 300
1761  s = 0;
1762  asm ("vmin4.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(s));
1763 #else /* __CUDA_ARCH__ >= 300 */
1764  s = vcmpges4 (b, a);// mask = 0xff if b >= a
1765  r = a & s; // select a when b >= a
1766  s = b & ~s; // select b when b < a
1767  r = r | s; // combine byte selections
1768 #endif /* __CUDA_ARCH__ >= 300 */
1769  return r; // byte-wise minimum of signed integers
1770 }
1771 
1772 static __device__ __forceinline__ unsigned int vminu4(unsigned int a, unsigned int b)
1773 {
1774  unsigned int r, s;
1775 #if __CUDA_ARCH__ >= 300
1776  s = 0;
1777  asm ("vmin4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(s));
1778 #else /* __CUDA_ARCH__ >= 300 */
1779  s = vcmpgeu4 (b, a);// mask = 0xff if b >= a
1780  r = a & s; // select a when b >= a
1781  s = b & ~s; // select b when b < a
1782  r = r | s; // combine byte selections
1783 #endif /* __CUDA_ARCH__ >= 300 */
1784  return r; // byte-wise minimum of unsigned integers
1785 }
1786 static __device__ __forceinline__ unsigned int vseteq4(unsigned int a, unsigned int b)
1787 {
1788  unsigned int r, c;
1789 #if __CUDA_ARCH__ >= 300
1790  c = 0;
1791  asm ("vset4.u32.u32.eq %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
1792 #else /* __CUDA_ARCH__ >= 300 */
1793  // inspired by Alan Mycroft's null-byte detection algorithm:
1794  // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
1795  r = a ^ b; // 0x00 if a == b
1796  c = r | 0x80808080; // set msbs, to catch carry out
1797  r = r ^ c; // extract msbs, msb = 1 if r < 0x80
1798  c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
1799  c = r & ~c; // msb = 1, if r was 0x00
1800  r = c >> 7; // convert to bool
1801 #endif /* __CUDA_ARCH__ >= 300 */
1802  return r; // byte-wise (un)signed eq comparison with bool result
1803 }
1804 
1805 static __device__ __forceinline__ unsigned int vsetles4(unsigned int a, unsigned int b)
1806 {
1807  unsigned int r;
1808 #if __CUDA_ARCH__ >= 300
1809  unsigned int c = 0;
1810  asm ("vset4.s32.s32.le %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
1811 #else /* __CUDA_ARCH__ >= 300 */
1812  /* a <= b <===> a + ~b < 0 */
1813  asm ("{ \n\t"
1814  ".reg .u32 a,b,r,s,t,u; \n\t"
1815  "mov.b32 a,%1; \n\t"
1816  "mov.b32 b,%2; \n\t"
1817  "not.b32 u,b; \n\t" // ~b
1818  "and.b32 r,a,0x7f7f7f7f;\n\t" // clear msbs
1819  "and.b32 t,u,0x7f7f7f7f;\n\t" // clear msbs
1820  "xor.b32 u,a,b; \n\t" // sum bits = (a ^ b)
1821  "add.u32 r,r,t; \n\t" // capture msb carry-in in bit 7
1822  "xor.b32 t,a,r; \n\t" // a ^ carry_in
1823  "not.b32 s,u; \n\t" // ~(a ^ b)
1824  "and.b32 t,t,u; \n\t" // msb: ovfl = (a ^ carry_in) & (a ^ b)
1825  "xor.b32 r,r,s; \n\t" // msb: result = (a ^ ~b ^ carry_in)
1826  "xor.b32 t,t,r; \n\t" // msb: le = ovfl != sign(res)
1827  "and.b32 t,t,0x80808080;\n\t" // isolate msbs
1828  "shr.u32 r,t,7; \n\t" // convert to bool
1829  "mov.b32 %0,r; \n\t"
1830  "}"
1831  : "=r"(r) : "r"(a), "r"(b));
1832 #endif /* __CUDA_ARCH__ >= 300 */
1833  return r; // byte-wise signed lt-eq comparison with bool result
1834 }
1835 
1836 static __device__ __forceinline__ unsigned int vsetleu4(unsigned int a, unsigned int b)
1837 {
1838  unsigned int r, c;
1839 #if __CUDA_ARCH__ >= 300
1840  c = 0;
1841  asm ("vset4.u32.u32.le %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
1842 #else /* __CUDA_ARCH__ >= 300 */
1843  asm ("not.b32 %0,%0;" : "+r"(a));
1844  c = vavgu4 (a, b); // (b + ~a + 1) / 2 = (b - a) / 2
1845  c = c & 0x80808080; // msb = carry-outs
1846  r = c >> 7; // convert to bool
1847 #endif /* __CUDA_ARCH__ >= 300 */
1848  return r; // byte-wise unsigned lt-eq comparison with bool result
1849 }
1850 
1851 static __device__ __forceinline__ unsigned int vsetlts4(unsigned int a, unsigned int b)
1852 {
1853  unsigned int r;
1854 #if __CUDA_ARCH__ >= 300
1855  unsigned int c = 0;
1856  asm ("vset4.s32.s32.lt %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
1857 #else /* __CUDA_ARCH__ >= 300 */
1858  asm ("{ \n\t"
1859  ".reg .u32 a, b, r, s, t, u;\n\t"
1860  "mov.b32 a,%1; \n\t"
1861  "mov.b32 b,%2; \n\t"
1862  "not.b32 u,b; \n\t" // ~b
1863  "or.b32 r,a,0x80808080;\n\t" // set msbs
1864  "and.b32 t,b,0x7f7f7f7f;\n\t" // clear msbs
1865  "xor.b32 s,u,a; \n\t" // a ^ ~b
1866  "sub.u32 r,r,t; \n\t" // subtract lsbs, msb: ~borrow-in
1867  "xor.b32 t,r,a; \n\t" // msb: ~borrow-in ^ a
1868  "not.b32 u,s; \n\t" // msb: ~(a^~b)
1869  "xor.b32 r,r,s; \n\t" // msb: res = a ^ ~b ^ ~borrow-in
1870  "and.b32 t,t,u; \n\t" // msb: ovfl= (~bw-in ^ a) & ~(a^~b)
1871  "xor.b32 t,t,r; \n\t" // msb: lt = ovfl != sign(res)
1872  "and.b32 t,t,0x80808080;\n\t" // isolate msbs
1873  "shr.u32 r,t,7; \n\t" // convert to bool
1874  "mov.b32 %0,r; \n\t"
1875  "}"
1876  : "=r"(r) : "r"(a), "r"(b));
1877 #endif /* __CUDA_ARCH__ >= 300 */
1878  return r; // byte-wise signed lt comparison with bool result
1879 }
1880 
1881 static __device__ __forceinline__ unsigned int vsetltu4(unsigned int a, unsigned int b)
1882 {
1883  unsigned int r, c;
1884 #if __CUDA_ARCH__ >= 300
1885  c = 0;
1886  asm ("vset4.u32.u32.lt %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
1887 #else /* __CUDA_ARCH__ >= 300 */
1888  asm ("not.b32 %0,%0;" : "+r"(a));
1889  c = vhaddu4 (a, b); // (b + ~a) / 2 = (b - a) / 2 [rounded down]
1890  c = c & 0x80808080; // msb = carry-outs
1891  r = c >> 7; // convert to bool
1892 #endif /* __CUDA_ARCH__ >= 300 */
1893  return r; // byte-wise unsigned lt comparison with bool result
1894 }
1895 
1896 static __device__ __forceinline__ unsigned int vsetges4(unsigned int a, unsigned int b)
1897 {
1898  unsigned int r;
1899 #if __CUDA_ARCH__ >= 300
1900  unsigned int c = 0;
1901  asm ("vset4.s32.s32.ge %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
1902 #else /* __CUDA_ARCH__ >= 300 */
1903  asm ("{ \n\t"
1904  ".reg .u32 a, b, r, s, t, u;\n\t"
1905  "mov.b32 a,%1; \n\t"
1906  "mov.b32 b,%2; \n\t"
1907  "xor.b32 s,a,b; \n\t" // a ^ b
1908  "or.b32 r,a,0x80808080;\n\t" // set msbs
1909  "and.b32 t,b,0x7f7f7f7f;\n\t" // clear msbs
1910  "sub.u32 r,r,t; \n\t" // subtract lsbs, msb: ~borrow-in
1911  "xor.b32 t,r,a; \n\t" // msb: ~borrow-in ^ a
1912  "xor.b32 r,r,s; \n\t" // msb: ~sign(res) = a^b^~borrow-in
1913  "and.b32 t,t,s; \n\t" // msb: ovfl= (~bw-in ^ a) & (a ^ b)
1914  "xor.b32 t,t,r; \n\t" // msb: ge = ovfl != ~sign(res)
1915  "and.b32 t,t,0x80808080;\n\t" // isolate msbs
1916  "shr.u32 r,t,7; \n\t" // convert to bool
1917  "mov.b32 %0,r; \n\t"
1918  "}"
1919  : "=r"(r) : "r"(a), "r"(b));
1920 #endif /* __CUDA_ARCH__ >= 300 */
1921  return r; // byte-wise signed gt-eq comparison with bool result
1922 }
1923 
1924 static __device__ __forceinline__ unsigned int vsetgeu4(unsigned int a, unsigned int b)
1925 {
1926  unsigned int r, c;
1927 #if __CUDA_ARCH__ >= 300
1928  c = 0;
1929  asm ("vset4.u32.u32.ge %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
1930 #else /* __CUDA_ARCH__ >= 300 */
1931  asm ("not.b32 %0,%0;" : "+r"(b));
1932  c = vavgu4 (a, b); // (a + ~b + 1) / 2 = (a - b) / 2
1933  c = c & 0x80808080; // msb = carry-outs
1934  r = c >> 7; // convert to bool
1935 #endif /* __CUDA_ARCH__ >= 300 */
1936  return r; // byte-wise unsigned gt-eq comparison with bool result
1937 }
1938 
1939 static __device__ __forceinline__ unsigned int vsetgts4(unsigned int a, unsigned int b)
1940 {
1941  unsigned int r;
1942 #if __CUDA_ARCH__ >= 300
1943  unsigned int c = 0;
1944  asm ("vset4.s32.s32.gt %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
1945 #else /* __CUDA_ARCH__ >= 300 */
1946  /* a <= b <===> a + ~b < 0 */
1947  asm ("{ \n\t"
1948  ".reg .u32 a,b,r,s,t,u; \n\t"
1949  "mov.b32 a,%1; \n\t"
1950  "mov.b32 b,%2; \n\t"
1951  "not.b32 b,b; \n\t"
1952  "and.b32 r,a,0x7f7f7f7f;\n\t" // clear msbs
1953  "and.b32 t,b,0x7f7f7f7f;\n\t" // clear msbs
1954  "xor.b32 s,a,b; \n\t" // sum bits = (a ^ b)
1955  "add.u32 r,r,t; \n\t" // capture msb carry-in in bit 7
1956  "xor.b32 t,a,r; \n\t" // a ^ carry_in
1957  "not.b32 u,s; \n\t" // ~(a ^ b)
1958  "and.b32 t,t,u; \n\t" // msb: ovfl = (a ^ carry_in) & ~(a^b)
1959  "xor.b32 r,r,u; \n\t" // msb: ~result = (~(a ^ b) ^ carry_in)
1960  "xor.b32 t,t,r; \n\t" // msb: gt = ovfl != sign(~res)
1961  "and.b32 t,t,0x80808080;\n\t" // isolate msbs
1962  "shr.u32 r,t,7; \n\t" // convert to bool
1963  "mov.b32 %0,r; \n\t"
1964  "}"
1965  : "=r"(r) : "r"(a), "r"(b));
1966 #endif /* __CUDA_ARCH__ >= 300 */
1967  return r; // byte-wise signed gt comparison with bool result
1968 }
1969 
1970 static __device__ __forceinline__ unsigned int vsetgtu4(unsigned int a, unsigned int b)
1971 {
1972  unsigned int r, c;
1973 #if __CUDA_ARCH__ >= 300
1974  c = 0;
1975  asm ("vset4.u32.u32.gt %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
1976 #else /* __CUDA_ARCH__ >= 300 */
1977  asm ("not.b32 %0,%0;" : "+r"(b));
1978  c = vhaddu4 (a, b); // (a + ~b) / 2 = (a - b) / 2 [rounded down]
1979  c = c & 0x80808080; // msb = carry-outs
1980  r = c >> 7; // convert to bool
1981 #endif /* __CUDA_ARCH__ >= 300 */
1982  return r; // byte-wise unsigned gt comparison with bool result
1983 }
1984 
1985 static __device__ __forceinline__ unsigned int vsetne4(unsigned int a, unsigned int b)
1986 {
1987  unsigned int r, c;
1988 #if __CUDA_ARCH__ >= 300
1989  c = 0;
1990  asm ("vset4.u32.u32.ne %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
1991 #else /* __CUDA_ARCH__ >= 300 */
1992  // inspired by Alan Mycroft's null-byte detection algorithm:
1993  // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
1994  r = a ^ b; // 0x00 if a == b
1995  c = r | 0x80808080; // set msbs, to catch carry out
1996  c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
1997  c = r | c; // msb = 1, if r was not 0x00
1998  c = c & 0x80808080; // extract msbs
1999  r = c >> 7; // convert to bool
2000 #endif /* __CUDA_ARCH__ >= 300 */
2001  return r; // byte-wise (un)signed ne comparison with bool result
2002 }
2003 
2004 static __device__ __forceinline__ unsigned int vsadu4(unsigned int a, unsigned int b)
2005 {
2006  unsigned int r, s;
2007 #if __CUDA_ARCH__ >= 300
2008  s = 0;
2009  asm("vabsdiff4.u32.u32.u32.add %0,%1,%2,%3;":"=r"(r):"r"(a),"r"(b),"r"(s));
2010 #else /* __CUDA_ARCH__ >= 300 */
2011  r = vabsdiffu4 (a, b);
2012  s = r >> 8;
2013  r = (r & 0x00ff00ff) + (s & 0x00ff00ff);
2014  r = ((r << 16) + r) >> 16;
2015 #endif /* __CUDA_ARCH__ >= 300 */
2016  return r; // byte-wise sum of absol. differences of unsigned ints
2017 }
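
/*
 Illustrative sketch, not part of the original header: a straightforward
 per-byte loop computing the same sum of absolute differences as the
 packed fold above; vsadu4_ref is a hypothetical name.
*/
static __device__ __forceinline__ unsigned int vsadu4_ref(unsigned int a, unsigned int b)
{
    unsigned int sum = 0;
    for (int i = 0; i < 4; i++) {
        unsigned int x = (a >> (8 * i)) & 0xff; // extract byte i of a
        unsigned int y = (b >> (8 * i)) & 0xff; // extract byte i of b
        sum += (x >= y) ? (x - y) : (y - x);    // accumulate |a_i - b_i|
    }
    return sum; // value in [0, 1020], matching the packed reduction
}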
2018 
2019 static __device__ __forceinline__ unsigned int vsub4(unsigned int a, unsigned int b)
2020 {
2021 #if __CUDA_ARCH__ >= 300
2022  unsigned int r, c = 0;
2023  asm ("vsub4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
2024 #else /* __CUDA_ARCH__ >= 300 */
2025  unsigned int r, s, t;
2026  s = a ^ ~b; // inverted sum bits
2027  r = a | 0x80808080; // set msbs
2028  t = b & 0x7f7f7f7f; // clear msbs
2029  s = s & 0x80808080; // inverted msb sum bits
2030  r = r - t; // subtract w/o msbs, record inverted borrows in msb
2031  r = r ^ s; // combine inverted msb sum bits and borrows
2032 #endif /* __CUDA_ARCH__ >= 300 */
2033  return r; // byte-wise difference
2034 }
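
/*
 Illustrative check, not part of the original header: the per-byte
 wrap-around difference that the borrow trick above must reproduce in
 each lane (setting the msbs of a keeps every lane's borrow from crossing
 into its neighbour); vsub4_ref_byte is a hypothetical name.
*/
static __device__ __forceinline__ unsigned int vsub4_ref_byte(unsigned int a, unsigned int b)
{
    return ((a & 0xff) - (b & 0xff)) & 0xff; // difference of one byte lane, modulo 256
}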
2035 
2036 static __device__ __forceinline__ unsigned int vsubss4(unsigned int a, unsigned int b)
2037 {
2038  unsigned int r;
2039 #if __CUDA_ARCH__ >= 300
2040  unsigned int c = 0;
2041  asm ("vsub4.s32.s32.s32.sat %0,%1,%2,%3;" : "=r"(r) :"r"(a),"r"(b),"r"(c));
2042 #else /* __CUDA_ARCH__ >= 300 */
2043  /*
2044  For signed saturation, saturation is controlled by the overflow signal:
2045  ovfl = (borrow-in to msb) XOR (borrow-out from msb). Overflow can only
2046  occur when the msbs of both inputs are different. The defined response to
2047  overflow is to deliver 0x7f when the minuend a is positive (bit 7 clear),
2048  and 0x80 when the minuend a is negative (bit 7 set). The truth table for
2049  the msb is
2050 
2051  a b bw_in res bw_out ovfl a^~bw_in ~(a^~b) (a^~bw_in)&~(a^~b)
2052  ---------------------------------------------------------------------
2053  0 0 0 0 0 0 1 0 0
2054  0 0 1 1 1 0 0 0 0
2055  0 1 0 1 1 1 1 1 1
2056  0 1 1 0 1 0 0 1 0
2057  1 0 0 1 0 0 0 1 0
2058  1 0 1 0 0 1 1 1 1
2059  1 1 0 0 0 0 0 0 0
2060  1 1 1 1 1 0 1 0 0
2061 
2062  The seven low-order bits can be handled by wrapping subtraction with the
2063  borrow-out from bit 6 recorded in the msb (thus corresponding to the
2064  bw_in in the truth table for the msb above). ovfl can be computed in many
2065  equivalent ways; here we use ovfl = (a ^ ~borrow_in) & ~(a ^ ~b) since we
2066  already need to compute (a ^ ~b) and ~borrow-in for the msb result bit
2067  computation. First we compute the normal, wrapped subtraction result.
2068  When overflow is detected, we mask off the result's msb, then compute a
2069  mask covering the seven low order bits, which are all set to 1. This sets
2070  the byte to 0x7f as we previously cleared the msb. In the overflow case,
2071  the sign of the result matches the sign of input a, so we extract the
2072  sign of a and add it to the low order bits, which turns 0x7f into 0x80,
2073  the correct result for an overflowed negative result.
2074  */
2075  asm ("{ \n\t"
2076  ".reg .u32 a,b,r,s,t,u,v,w; \n\t"
2077  "mov.b32 a,%1; \n\t"
2078  "mov.b32 b,%2; \n\t"
2079  "not.b32 u,b; \n\t" // ~b
2080  "xor.b32 s,u,a; \n\t" // a ^ ~b
2081  "or.b32 r,a,0x80808080;\n\t" // set msbs
2082  "and.b32 t,b,0x7f7f7f7f;\n\t" // clear msbs
2083  "sub.u32 r,r,t; \n\t" // subtract lsbs, msb: ~borrow-in
2084  "xor.b32 t,r,a; \n\t" // msb: ~borrow-in ^ a
2085  "not.b32 u,s; \n\t" // msb: ~(a^~b)
2086  "and.b32 s,s,0x80808080;\n\t" // msb: a ^ ~b
2087  "xor.b32 r,r,s; \n\t" // msb: res = a ^ ~b ^ ~borrow-in
2088  "and.b32 t,t,u; \n\t" // msb: ovfl= (~bw-in ^ a) & ~(a^~b)
2089 #if __CUDA_ARCH__ >= 200
2090  "prmt.b32 s,a,0,0xba98; \n\t" // sign(a) ? 0xff : 0
2091  "xor.b32 s,s,0x7f7f7f7f;\n\t" // sign(a) ? 0x80 : 0x7f
2092  "prmt.b32 t,t,0,0xba98; \n\t" // ovfl ? 0xff : 0
2093  "and.b32 s,s,t; \n\t" // ovfl ? (sign(a) ? 0x80:0x7f) : 0
2094  "not.b32 t,t; \n\t" // ~ovfl
2095  "and.b32 r,r,t; \n\t" // ovfl ? 0 : a + b
2096  "or.b32 r,r,s; \n\t" // ovfl ? (sign(a) ? 0x80:0x7f) :a+b
2097 #else /* __CUDA_ARCH__ >= 200 */
2098  "and.b32 t,t,0x80808080;\n\t" // ovfl ? 0x80 : 0
2099  "shr.u32 s,t,7; \n\t" // ovfl ? 1 : 0
2100  "not.b32 u,t; \n\t" // ovfl ? 0x7f : 0xff
2101  "and.b32 r,r,u; \n\t" // ovfl ? (a - b) & 0x7f : a - b
2102  "and.b32 u,a,t; \n\t" // ovfl ? a & 0x80 : 0
2103  "sub.u32 t,t,s; \n\t" // ovfl ? 0x7f : 0
2104  "shr.u32 u,u,7; \n\t" // ovfl ? sign(a) : 0
2105  "or.b32 r,r,t; \n\t" // ovfl ? 0x7f : a - b
2106  "add.u32 r,r,u; \n\t" // ovfl ? 0x7f+sign(a) : a - b
2107 #endif /* __CUDA_ARCH__ >= 200 */
2108  "mov.b32 %0,r; \n\t"
2109  "}"
2110  : "=r"(r) : "r"(a), "r"(b));
2111 #endif /* __CUDA_ARCH__ >= 300 */
2112  return r; // byte-wise difference with signed saturation
2113 }
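
/*
 Illustrative sketch, not part of the original header: the per-byte
 semantics the vsubss4 emulation above reproduces; vsubss4_ref_byte is a
 hypothetical name used for illustration only.
*/
static __device__ __forceinline__ unsigned int vsubss4_ref_byte(unsigned int a, unsigned int b)
{
    int sa = (int)(signed char)(a & 0xff); // sign-extend the low byte of a
    int sb = (int)(signed char)(b & 0xff); // sign-extend the low byte of b
    int d = sa - sb;                       // exact difference
    if (d >  127) d =  127;                // clamp positive overflow to 0x7f
    if (d < -128) d = -128;                // clamp negative overflow to 0x80
    return (unsigned int)d & 0xff;         // low byte holds sat.s8(a - b)
}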
2114 
2115 static __device__ __forceinline__ unsigned int vsubus4(unsigned int a, unsigned int b)
2116 {
2117  unsigned int r;
2118 #if __CUDA_ARCH__ >= 300
2119  unsigned int c = 0;
2120  asm ("vsub4.u32.u32.u32.sat %0,%1,%2,%3;" : "=r"(r) :"r"(a),"r"(b),"r"(c));
2121 #else /* __CUDA_ARCH__ >= 300 */
2122  // This code uses the same basic approach used for the non-saturating
2123  // subtraction. The seven low-order bits in each byte are subtracted by
2124  // regular subtraction with the inverse of the borrow-out from bit 6 (=
2125  // inverse of borrow-in for the msb) being recorded in bit 7, while the
2126  // msb is handled separately.
2127  //
2128  // Clamping to 0 needs to happen when there is a borrow-out from the msb.
2129  // This is simply accomplished by ANDing the normal subtraction result with
2130  // a mask based on the inverted msb borrow-out: ~borrow-out ? 0xff : 0x00.
2131  // The borrow-out information is generated from the msb. Since we already
2132  // have the msb's ~borrow-in and (a^~b) available from the computation of
2133  // the msb result bit, the most efficient way to compute msb ~borrow-out
2134  // is: ((a ^ ~b) & ~borrow-in) | (~b & a). The truth table for the msb is
2135  //
2136  // a b bw-in res ~bw-out a^~b (a^~b)&~bw-in (a&~b) ((a^~b)&~bw-in)|(a&~b)
2137  //
2138  // 0 0 0 0 1 1 1 0 1
2139  // 0 0 1 1 0 1 0 0 0
2140  // 0 1 0 1 0 0 0 0 0
2141  // 0 1 1 0 0 0 0 0 0
2142  // 1 0 0 1 1 0 0 1 1
2143  // 1 0 1 0 1 0 0 1 1
2144  // 1 1 0 0 1 1 1 0 1
2145  // 1 1 1 1 0 1 0 0 0
2146  //
2147  asm ("{ \n\t"
2148  ".reg .u32 a,b,r,s,t,u; \n\t"
2149  "mov.b32 a,%1; \n\t"
2150  "mov.b32 b,%2; \n\t"
2151  "not.b32 u,b; \n\t" // ~b
2152  "xor.b32 s,u,a; \n\t" // a ^ ~b
2153  "and.b32 u,u,a; \n\t" // a & ~b
2154  "or.b32 r,a,0x80808080;\n\t" // set msbs
2155  "and.b32 t,b,0x7f7f7f7f;\n\t" // clear msbs
2156  "sub.u32 r,r,t; \n\t" // subtract lsbs, msb: ~borrow-in
2157  "and.b32 t,r,s; \n\t" // msb: (a ^ ~b) & ~borrow-in
2158  "and.b32 s,s,0x80808080;\n\t" // msb: a ^ ~b
2159  "xor.b32 r,r,s; \n\t" // msb: res = a ^ ~b ^ ~borrow-in
2160  "or.b32 t,t,u; \n\t" // msb: bw-out = ((a^~b)&~bw-in)|(a&~b)
2161 #if __CUDA_ARCH__ >= 200
2162  "prmt.b32 t,t,0,0xba98; \n\t" // ~borrow-out ? 0xff : 0
2163 #else /* __CUDA_ARCH__ >= 200 */
2164  "and.b32 t,t,0x80808080;\n\t" // isolate msb: ~borrow-out
2165  "shr.u32 s,t,7; \n\t" // build mask
2166  "sub.u32 s,t,s; \n\t" // from
2167  "or.b32 t,t,s; \n\t" // msb
2168 #endif /* __CUDA_ARCH__ >= 200 */
2169  "and.b32 r,r,t; \n\t" // cond. clear result if msb borrow-out
2170  "mov.b32 %0,r; \n\t"
2171  "}"
2172  : "=r"(r) : "r"(a) , "r"(b));
2173 #endif /* __CUDA_ARCH__ >= 300 */
2174  return r; // byte-wise difference with unsigned saturation
2175 }
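
/*
 Illustrative sketch, not part of the original header: the per-byte
 semantics the vsubus4 emulation above reproduces; vsubus4_ref_byte is a
 hypothetical name used for illustration only.
*/
static __device__ __forceinline__ unsigned int vsubus4_ref_byte(unsigned int a, unsigned int b)
{
    unsigned int x = a & 0xff, y = b & 0xff;
    return (x >= y) ? (x - y) : 0u; // clamp to 0 when the lane would borrow
}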
2176 
2177 static __device__ __forceinline__ unsigned int vneg4(unsigned int a)
2178 {
2179  return vsub4 (0, a);// byte-wise negation with wrap-around
2180 }
2181 
2182 static __device__ __forceinline__ unsigned int vnegss4(unsigned int a)
2183 {
2184  unsigned int r;
2185 #if __CUDA_ARCH__ >= 300
2186  unsigned int s = 0;
2187  asm ("vsub4.s32.s32.s32.sat %0,%1,%2,%3;" : "=r"(r) :"r"(s),"r"(a),"r"(s));
2188 #else /* __CUDA_ARCH__ >= 300 */
2189  r = vsub4 (0, a); //
2190  asm ("{ \n\t"
2191  ".reg .u32 a, r, s; \n\t"
2192  "mov.b32 r,%0; \n\t"
2193  "mov.b32 a,%1; \n\t"
2194  "and.b32 a,a,0x80808080;\n\t" // extract msb
2195  "and.b32 s,a,r; \n\t" // wrap-around if msb set in a and -a
2196  "shr.u32 s,s,7; \n\t" // msb ? 1 : 0
2197  "sub.u32 r,r,s; \n\t" // subtract 1 if result wrapped around
2198  "mov.b32 %0,r; \n\t"
2199  "}"
2200  : "+r"(r) : "r"(a));
2201 #endif /* __CUDA_ARCH__ >= 300 */
2202  return r; // byte-wise negation with signed saturation
2203 }
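
/*
 Illustrative sketch, not part of the original header: the per-byte
 semantics of the wrap-around correction above. Only a byte equal to 0x80
 has the msb set in both a and 0 - a, and subtracting 1 from the wrapped
 result 0x80 yields the saturated value 0x7f; vnegss4_ref_byte is a
 hypothetical name.
*/
static __device__ __forceinline__ unsigned int vnegss4_ref_byte(unsigned int a)
{
    int sa = (int)(signed char)(a & 0xff); // sign-extend the low byte
    int r = (sa == -128) ? 127 : -sa;      // negate with signed saturation
    return (unsigned int)r & 0xff;
}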
2204 
2205 static __device__ __forceinline__ unsigned int vabsdiffs4(unsigned int a, unsigned int b)
2206 {
2207  unsigned int r, s;
2208 #if __CUDA_ARCH__ >= 300
2209  s = 0;
2210  asm ("vabsdiff4.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) :"r"(a),"r"(b),"r"(s));
2211 #else /* __CUDA_ARCH__ >= 300 */
2212  s = vcmpges4 (a, b);// mask = 0xff if a >= b
2213  r = a ^ b; //
2214  s = (r & s) ^ b; // select a when a >= b, else select b => max(a,b)
2215  r = s ^ r; // select a when b >= a, else select b => min(a,b)
2216  r = vsub4 (s, r); // |a - b| = max(a,b) - min(a,b);
2217 #endif /* __CUDA_ARCH__ >= 300 */
2218  return r; // byte-wise absolute difference of signed integers
2219 }
2220 
2221 static __device__ __forceinline__ unsigned int vsads4(unsigned int a, unsigned int b)
2222 {
2223  unsigned int r, s;
2224 #if __CUDA_ARCH__ >= 300
2225  s = 0;
2226  asm("vabsdiff4.s32.s32.s32.add %0,%1,%2,%3;":"=r"(r):"r"(a),"r"(b),"r"(s));
2227 #else /* __CUDA_ARCH__ >= 300 */
2228  r = vabsdiffs4 (a, b);
2229  s = r >> 8;
2230  r = (r & 0x00ff00ff) + (s & 0x00ff00ff);
2231  r = ((r << 16) + r) >> 16;
2232 #endif /* __CUDA_ARCH__ >= 300 */
2233  return r; // byte-wise sum of absolute differences of signed ints
2234 }
2235 
2236 #endif /* SIMD_FUNCTIONS_H__ */