#if !defined (SIMD_FUNCTIONS_H__)
#define SIMD_FUNCTIONS_H__
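/*
 * Within-word SIMD operations on pairs of 16-bit values ("2" suffix) and
 * quads of 8-bit values ("4" suffix) packed into one 32-bit register. On
 * sm_30 and later the hardware video instructions (vadd2, vabsdiff4, ...)
 * are used via inline PTX; on older architectures each operation falls
 * back to an emulation sequence built from ordinary integer instructions.
 */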
static __device__ __forceinline__ unsigned int vabs2(unsigned int a)
{
    unsigned int r;
#if __CUDA_ARCH__ >= 300
    unsigned int b = 0, c = 0;
    asm ("vabsdiff2.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#elif __CUDA_ARCH__ >= 200
    asm ("{                       \n\t"
         ".reg .u32 a,m,r;        \n\t"
         "mov.b32  a,%1;          \n\t"
         "prmt.b32 m,a,0,0xbb99;  \n\t" // sign(a) ? 0xffff : 0x0000 per halfword
         "xor.b32  r,a,m;         \n\t" // conditionally invert bits
         "and.b32  m,m,0x00010001;\n\t" // sign(a) ? 1 : 0 per halfword
         "add.u32  r,r,m;         \n\t" // conditionally add 1
         "mov.b32  %0,r;          \n\t"
         "}"
         : "=r"(r) : "r"(a));
#else  /* __CUDA_ARCH__ >= 200 */
    asm ("{                       \n\t"
         ".reg .u32 a,m,r,s;      \n\t"
         "mov.b32  a,%1;          \n\t"
         "and.b32  m,a,0x80008000;\n\t" // extract sign bits
         "and.b32  r,a,0x7fff7fff;\n\t" // clear sign bits
         "shr.u32  s,m,15;        \n\t" // sign(a) ? 1 : 0 per halfword
         "sub.u32  m,m,s;         \n\t" // sign(a) ? 0x7fff : 0 per halfword
         "xor.b32  r,r,m;         \n\t" // conditionally invert low bits
         "add.u32  r,r,s;         \n\t" // conditionally add 1
         "mov.b32  %0,r;          \n\t"
         "}"
         : "=r"(r) : "r"(a));
#endif /* __CUDA_ARCH__ >= 200 */
    return r;           // halfword-wise absolute value, with wrap-around
}
static __device__ __forceinline__ unsigned int vabsss2(unsigned int a)
{
    unsigned int r;
#if __CUDA_ARCH__ >= 300
    unsigned int b = 0, c = 0;
    asm ("vabsdiff2.s32.s32.s32.sat %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#elif __CUDA_ARCH__ >= 200
    asm ("{                       \n\t"
         ".reg .u32 a,m,r;        \n\t"
         "mov.b32  a,%1;          \n\t"
         "prmt.b32 m,a,0,0xbb99;  \n\t" // sign(a) ? 0xffff : 0x0000 per halfword
         "xor.b32  r,a,m;         \n\t" // conditionally invert bits
         "and.b32  m,m,0x00010001;\n\t" // sign(a) ? 1 : 0 per halfword
         "add.u32  r,r,m;         \n\t" // conditionally add 1
         "prmt.b32 m,r,0,0xbb99;  \n\t" // sign(r) ? 0xffff : 0x0000 per halfword
         "and.b32  m,m,0x00010001;\n\t" // sign(r) ? 1 : 0 per halfword
         "sub.u32  r,r,m;         \n\t" // subtract 1 where result wrapped to 0x8000
         "mov.b32  %0,r;          \n\t"
         "}"
         : "=r"(r) : "r"(a));
#else  /* __CUDA_ARCH__ >= 200 */
    asm ("{                       \n\t"
         ".reg .u32 a,m,r,s;      \n\t"
         "mov.b32  a,%1;          \n\t"
         "and.b32  m,a,0x80008000;\n\t" // extract sign bits
         "and.b32  r,a,0x7fff7fff;\n\t" // clear sign bits
         "shr.u32  s,m,15;        \n\t" // sign(a) ? 1 : 0 per halfword
         "sub.u32  m,m,s;         \n\t" // sign(a) ? 0x7fff : 0 per halfword
         "xor.b32  r,r,m;         \n\t" // conditionally invert low bits
         "add.u32  r,r,s;         \n\t" // conditionally add 1
         "and.b32  m,r,0x80008000;\n\t" // result still negative?
         "shr.u32  s,m,15;        \n\t" // ... then input was 0x8000 ...
         "sub.u32  r,r,s;         \n\t" // ... clamp result to 0x7fff
         "mov.b32  %0,r;          \n\t"
         "}"
         : "=r"(r) : "r"(a));
#endif /* __CUDA_ARCH__ >= 200 */
    return r;           // halfword-wise absolute value with signed saturation
}
static __device__ __forceinline__ unsigned int vadd2(unsigned int a, unsigned int b)
{
    unsigned int s, t;
#if __CUDA_ARCH__ >= 300
    s = 0;
    asm ("vadd2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(t) : "r"(a), "r"(b), "r"(s));
#else  /* __CUDA_ARCH__ >= 300 */
    s = a ^ b;          // sum bits
    t = a + b;          // actual sum
    s = s ^ t;          // determine carry-ins for each bit position
    s = s & 0x00010000; // carry-in to high halfword = carry-out from low one
    t = t - s;          // subtract out carry-out from low halfword
#endif /* __CUDA_ARCH__ >= 300 */
    return t;           // halfword-wise sum, with wrap-around
}
static __device__ __forceinline__ unsigned int vaddss2 (unsigned int a, unsigned int b)
{
    unsigned int r;
#if __CUDA_ARCH__ >= 300
    unsigned int c = 0;
    asm ("vadd2.s32.s32.s32.sat %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#else  /* __CUDA_ARCH__ >= 300 */
    int ahi, alo, blo, bhi, rhi, rlo;
    ahi = (int)(a & 0xffff0000U);
    bhi = (int)(b & 0xffff0000U);
#if __CUDA_ARCH__ < 200
    alo = (int)(a << 16);
    blo = (int)(b << 16);
#elif __CUDA_ARCH__ < 350
    asm ("prmt.b32 %0,%1,0,0x1044;" : "=r"(alo) : "r"(a)); // a << 16 via permute
    asm ("prmt.b32 %0,%1,0,0x1044;" : "=r"(blo) : "r"(b)); // b << 16 via permute
#else  /* __CUDA_ARCH__ < 350 */
    asm ("shf.l.clamp.b32 %0,0,%1,16;" : "=r"(alo) : "r"(a)); // a << 16
    asm ("shf.l.clamp.b32 %0,0,%1,16;" : "=r"(blo) : "r"(b)); // b << 16
#endif /* __CUDA_ARCH__ < 350 */
    asm ("add.sat.s32 %0,%1,%2;" : "=r"(rlo) : "r"(alo), "r"(blo)); // low halves
    asm ("add.sat.s32 %0,%1,%2;" : "=r"(rhi) : "r"(ahi), "r"(bhi)); // high halves
#if __CUDA_ARCH__ < 200
    r = ((unsigned int)rhi & 0xffff0000U) | ((unsigned int)rlo >> 16);
#else  /* __CUDA_ARCH__ < 200 */
    asm ("prmt.b32 %0,%1,%2,0x7632;" : "=r"(r) : "r"(rlo), "r"(rhi)); // combine
#endif /* __CUDA_ARCH__ < 200 */
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // halfword-wise sum with signed saturation
}
static __device__ __forceinline__ unsigned int vaddus2 (unsigned int a, unsigned int b)
{
    unsigned int r;
#if __CUDA_ARCH__ >= 300
    unsigned int c = 0;
    asm ("vadd2.u32.u32.u32.sat %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#else  /* __CUDA_ARCH__ >= 300 */
    int alo, blo, ahi, bhi, rlo, rhi;
    asm ("{                             \n\t"
         "and.b32 %0, %4, 0xffff;       \n\t" // low halfword of a
         "and.b32 %1, %5, 0xffff;       \n\t" // low halfword of b
#if __CUDA_ARCH__ < 350
         "shr.u32 %2, %4, 16;           \n\t" // high halfword of a
         "shr.u32 %3, %5, 16;           \n\t" // high halfword of b
#else  /* __CUDA_ARCH__ < 350 */
         "shf.r.clamp.b32 %2, %4, 0, 16;\n\t" // high halfword of a
         "shf.r.clamp.b32 %3, %5, 0, 16;\n\t" // high halfword of b
#endif /* __CUDA_ARCH__ < 350 */
         "}"
         : "=r"(alo), "=r"(blo), "=r"(ahi), "=r"(bhi)
         : "r"(a), "r"(b));
    rlo = min (alo + blo, 65535); // clamp low halfword sum
    rhi = min (ahi + bhi, 65535); // clamp high halfword sum
    r = (rhi << 16) + rlo;        // repack halfwords
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // halfword-wise sum with unsigned saturation
}
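/* Averaging operations: vavgs2/vavgu2 round as the hardware vavrg2
   instruction does (ties away from zero for signed operands, up for
   unsigned ones), while vhaddu2 further below rounds down. */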
static __device__ __forceinline__ unsigned int vavgs2(unsigned int a, unsigned int b)
{
    unsigned int r;
#if __CUDA_ARCH__ >= 300
    unsigned int c = 0;
    asm ("vavrg2.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#else  /* __CUDA_ARCH__ >= 300 */
    // avgs (a, b) = ((a + b) < 0) ? ((a + b) >> 1) : ((a + b + 1) >> 1).
    // Compute (a >> 1) + (b >> 1) + ((a & b) & 1) = ((a + b) >> 1) without
    // overflow, then conditionally add ((a ^ b) & 1) based on the sign bit
    // of the shifted sum. The msb sum bit is handled separately to avoid
    // carry-out during summation and to allow logical right shifts.
    asm ("{                      \n\t"
         ".reg .u32 a,b,c,r,s,t,u,v;\n\t"
         "mov.b32 a,%1;          \n\t"
         "mov.b32 b,%2;          \n\t"
         "and.b32 u,a,0xfffefffe;\n\t" // prevent shift crossing chunk boundary
         "and.b32 v,b,0xfffefffe;\n\t" // prevent shift crossing chunk boundary
         "xor.b32 s,a,b;         \n\t" // a ^ b
         "and.b32 t,a,b;         \n\t" // a & b
         "shr.u32 u,u,1;         \n\t" // a >> 1
         "shr.u32 v,v,1;         \n\t" // b >> 1
         "and.b32 c,s,0x00010001;\n\t" // (a ^ b) & 1
         "and.b32 s,s,0x80008000;\n\t" // extract msbs of (a ^ b)
         "and.b32 t,t,0x00010001;\n\t" // (a & b) & 1
         "add.u32 r,u,v;         \n\t" // (a >> 1) + (b >> 1)
         "add.u32 r,r,t;         \n\t" // ... + ((a & b) & 1); record msb cy-in
         "xor.b32 r,r,s;         \n\t" // compute msb sum bit: a ^ b ^ cy-in
         "shr.u32 t,r,15;        \n\t" // sign ((a + b) >> 1)
         "not.b32 t,t;           \n\t" // ~sign ((a + b) >> 1)
         "and.b32 t,t,c;         \n\t" // ((a ^ b) & 1) & ~sign
         "add.u32 r,r,t;         \n\t" // conditionally round up
         "mov.b32 %0,r;          \n\t"
         "}"
         : "=r"(r) : "r"(a), "r"(b));
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // halfword-wise average of signed integers
}
static __device__ __forceinline__ unsigned int vavgu2(unsigned int a, unsigned int b)
{
    unsigned int r, c;
#if __CUDA_ARCH__ >= 300
    c = 0;
    asm ("vavrg2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#else  /* __CUDA_ARCH__ >= 300 */
    // HAKMEM #23: a averaged with b is given by (a | b) - ((a ^ b) >> 1)
    c = a ^ b;
    r = a | b;
    c = c & 0xfffefffe; // ensure shift doesn't cross halfword boundaries
    c = c >> 1;
    r = r - c;
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // halfword-wise average of unsigned ints, rounded up
}
static __device__ __forceinline__ unsigned int vhaddu2(unsigned int a, unsigned int b)
{
    // HAKMEM #23: a+b = 2*(a & b) + (a ^ b), so (a+b)/2 = (a&b) + ((a^b)>>1)
    unsigned int s = a ^ b;
    s = s & 0xfffefffe;        // ensure shift doesn't cross halfword boundaries
    return (a & b) + (s >> 1); // halfword-wise unsigned average, rounded down
}
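/*
 * Comparison operations come in two flavors: vcmpXX returns a mask result
 * (0xffff per true halfword / 0xff per true byte, 0 otherwise), suitable
 * for selecting between values with logic ops, while vsetXX returns a bool
 * result (1 per true lane). The unsigned emulations compute (a - b) / 2 by
 * averaging with ~b, so each lane's carry-out lands in that lane's msb.
 */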
static __device__ __forceinline__ unsigned int vcmpeq2(unsigned int a, unsigned int b)
{
    unsigned int r, c;
#if __CUDA_ARCH__ >= 300
    c = 0;
    asm ("vset2.u32.u32.eq %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
    c = r << 16;        // convert bool
    r = c - r;          //  into mask
#else  /* __CUDA_ARCH__ >= 300 */
    // inspired by Alan Mycroft's null-byte detection algorithm:
    // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
    r = a ^ b;          // 0x0000 if a == b
    c = r | 0x80008000; // set msbs, to catch carry out
    r = r ^ c;          // extract msbs, msb = 1 if r < 0x8000
    c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
    c = r & ~c;         // msb = 1, if r was 0x0000
#if __CUDA_ARCH__ >= 200
    asm ("prmt.b32 %0,%1,0,0xbb99;" : "=r"(r) : "r"(c)); // build mask from msbs
#else  /* __CUDA_ARCH__ >= 200 */
    asm ("shr.u32 %0,%1,15;" : "=r"(r) : "r"(c)); // convert
    asm ("sub.u32 %0,%1,%0;" : "+r"(r) : "r"(c)); //  msbs to
    asm ("or.b32  %0,%1,%0;" : "+r"(r) : "r"(c)); //   mask
#endif /* __CUDA_ARCH__ >= 200 */
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // halfword-wise (un)signed eq comparison, mask result
}
static __device__ __forceinline__ unsigned int vcmpges2(unsigned int a, unsigned int b)
{
    unsigned int r;
#if __CUDA_ARCH__ >= 300
    unsigned int c = 0;
    asm ("vset2.s32.s32.ge %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
    c = r << 16;        // convert bool
    r = c - r;          //  into mask
#else  /* __CUDA_ARCH__ >= 300 */
    asm ("{                             \n\t"
         ".reg .u32 a, b, r, s, t, u;   \n\t"
         "mov.b32        a,%1;          \n\t"
         "mov.b32        b,%2;          \n\t"
         "and.b32        s,a,0xffff0000;\n\t" // high halfword of a
         "and.b32        t,b,0xffff0000;\n\t" // high halfword of b
         "set.ge.s32.s32 u,s,t;         \n\t" // compare high halfwords
         "cvt.s32.s16    s,a;           \n\t" // sign-extend low halfword of a
         "cvt.s32.s16    t,b;           \n\t" // sign-extend low halfword of b
         "set.ge.s32.s32 s,s,t;         \n\t" // compare low halfwords
#if __CUDA_ARCH__ >= 200
         "prmt.b32       r,s,u,0x7632;  \n\t" // combine the two mask results
#else  /* __CUDA_ARCH__ >= 200 */
         "and.b32        u,u,0xffff0000;\n\t" // mask high comparison result
         "and.b32        s,s,0x0000ffff;\n\t" // mask low comparison result
         "or.b32         r,s,u;         \n\t" // combine the two results
#endif /* __CUDA_ARCH__ >= 200 */
         "mov.b32        %0,r;          \n\t"
         "}"
         : "=r"(r) : "r"(a), "r"(b));
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // halfword-wise signed ge comparison, mask result
}
static __device__ __forceinline__ unsigned int vcmpgeu2(unsigned int a, unsigned int b)
{
    unsigned int r, c;
#if __CUDA_ARCH__ >= 300
    c = 0;
    asm ("vset2.u32.u32.ge %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
    c = r << 16;        // convert bool
    r = c - r;          //  into mask
#else  /* __CUDA_ARCH__ >= 300 */
    asm ("not.b32 %0,%0;" : "+r"(b));
    c = vavgu2 (a, b);  // (a + ~b + 1) / 2 = (a - b) / 2
#if __CUDA_ARCH__ >= 200
    asm ("prmt.b32 %0,%1,0,0xbb99;" : "=r"(r) : "r"(c)); // build mask from msbs
#else  /* __CUDA_ARCH__ >= 200 */
    asm ("and.b32 %0,%0,0x80008000;" : "+r"(c));  // msb = carry-outs
    asm ("shr.u32 %0,%1,15;" : "=r"(r) : "r"(c)); // convert
    asm ("sub.u32 %0,%1,%0;" : "+r"(r) : "r"(c)); //  msbs to
    asm ("or.b32  %0,%1,%0;" : "+r"(r) : "r"(c)); //   mask
#endif /* __CUDA_ARCH__ >= 200 */
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // halfword-wise unsigned ge comparison, mask result
}
static __device__ __forceinline__ unsigned int vcmpgts2(unsigned int a, unsigned int b)
{
    unsigned int r;
#if __CUDA_ARCH__ >= 300
    unsigned int c = 0;
    asm ("vset2.s32.s32.gt %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
    c = r << 16;        // convert bool
    r = c - r;          //  into mask
#else  /* __CUDA_ARCH__ >= 300 */
    asm ("{                             \n\t"
         ".reg .u32 a, b, r, s, t, u;   \n\t"
         "mov.b32        a,%1;          \n\t"
         "mov.b32        b,%2;          \n\t"
         "and.b32        s,a,0xffff0000;\n\t" // high halfword of a
         "and.b32        t,b,0xffff0000;\n\t" // high halfword of b
         "set.gt.s32.s32 u,s,t;         \n\t" // compare high halfwords
         "cvt.s32.s16    s,a;           \n\t" // sign-extend low halfword of a
         "cvt.s32.s16    t,b;           \n\t" // sign-extend low halfword of b
         "set.gt.s32.s32 s,s,t;         \n\t" // compare low halfwords
#if __CUDA_ARCH__ >= 200
         "prmt.b32       r,s,u,0x7632;  \n\t" // combine the two mask results
#else  /* __CUDA_ARCH__ >= 200 */
         "and.b32        u,u,0xffff0000;\n\t" // mask high comparison result
         "and.b32        s,s,0x0000ffff;\n\t" // mask low comparison result
         "or.b32         r,s,u;         \n\t" // combine the two results
#endif /* __CUDA_ARCH__ >= 200 */
         "mov.b32        %0,r;          \n\t"
         "}"
         : "=r"(r) : "r"(a), "r"(b));
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // halfword-wise signed gt comparison, mask result
}
static __device__ __forceinline__ unsigned int vcmpgtu2(unsigned int a, unsigned int b)
{
    unsigned int r, c;
#if __CUDA_ARCH__ >= 300
    c = 0;
    asm ("vset2.u32.u32.gt %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
    c = r << 16;        // convert bool
    r = c - r;          //  into mask
#else  /* __CUDA_ARCH__ >= 300 */
    asm ("not.b32 %0,%0;" : "+r"(b));
    c = vhaddu2 (a, b); // (a + ~b) / 2 = (a - b) / 2 [rounded down]
#if __CUDA_ARCH__ >= 200
    asm ("prmt.b32 %0,%1,0,0xbb99;" : "=r"(r) : "r"(c)); // build mask from msbs
#else  /* __CUDA_ARCH__ >= 200 */
    asm ("and.b32 %0,%0,0x80008000;" : "+r"(c));  // msb = carry-outs
    asm ("shr.u32 %0,%1,15;" : "=r"(r) : "r"(c)); // convert
    asm ("sub.u32 %0,%1,%0;" : "+r"(r) : "r"(c)); //  msbs to
    asm ("or.b32  %0,%1,%0;" : "+r"(r) : "r"(c)); //   mask
#endif /* __CUDA_ARCH__ >= 200 */
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // halfword-wise unsigned gt comparison, mask result
}
static __device__ __forceinline__ unsigned int vcmples2(unsigned int a, unsigned int b)
{
    unsigned int r;
#if __CUDA_ARCH__ >= 300
    unsigned int c = 0;
    asm ("vset2.s32.s32.le %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
    c = r << 16;        // convert bool
    r = c - r;          //  into mask
#else  /* __CUDA_ARCH__ >= 300 */
    asm ("{                             \n\t"
         ".reg .u32 a, b, r, s, t, u;   \n\t"
         "mov.b32        a,%1;          \n\t"
         "mov.b32        b,%2;          \n\t"
         "and.b32        s,a,0xffff0000;\n\t" // high halfword of a
         "and.b32        t,b,0xffff0000;\n\t" // high halfword of b
         "set.le.s32.s32 u,s,t;         \n\t" // compare high halfwords
         "cvt.s32.s16    s,a;           \n\t" // sign-extend low halfword of a
         "cvt.s32.s16    t,b;           \n\t" // sign-extend low halfword of b
         "set.le.s32.s32 s,s,t;         \n\t" // compare low halfwords
#if __CUDA_ARCH__ >= 200
         "prmt.b32       r,s,u,0x7632;  \n\t" // combine the two mask results
#else  /* __CUDA_ARCH__ >= 200 */
         "and.b32        u,u,0xffff0000;\n\t" // mask high comparison result
         "and.b32        s,s,0x0000ffff;\n\t" // mask low comparison result
         "or.b32         r,s,u;         \n\t" // combine the two results
#endif /* __CUDA_ARCH__ >= 200 */
         "mov.b32        %0,r;          \n\t"
         "}"
         : "=r"(r) : "r"(a), "r"(b));
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // halfword-wise signed le comparison, mask result
}
static __device__ __forceinline__ unsigned int vcmpleu2(unsigned int a, unsigned int b)
{
    unsigned int r, c;
#if __CUDA_ARCH__ >= 300
    c = 0;
    asm ("vset2.u32.u32.le %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
    c = r << 16;        // convert bool
    r = c - r;          //  into mask
#else  /* __CUDA_ARCH__ >= 300 */
    asm ("not.b32 %0,%0;" : "+r"(a));
    c = vavgu2 (a, b);  // (b + ~a + 1) / 2 = (b - a) / 2
#if __CUDA_ARCH__ >= 200
    asm ("prmt.b32 %0,%1,0,0xbb99;" : "=r"(r) : "r"(c)); // build mask from msbs
#else  /* __CUDA_ARCH__ >= 200 */
    asm ("and.b32 %0,%0,0x80008000;" : "+r"(c));  // msb = carry-outs
    asm ("shr.u32 %0,%1,15;" : "=r"(r) : "r"(c)); // convert
    asm ("sub.u32 %0,%1,%0;" : "+r"(r) : "r"(c)); //  msbs to
    asm ("or.b32  %0,%1,%0;" : "+r"(r) : "r"(c)); //   mask
#endif /* __CUDA_ARCH__ >= 200 */
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // halfword-wise unsigned le comparison, mask result
}
static __device__ __forceinline__ unsigned int vcmplts2(unsigned int a, unsigned int b)
{
    unsigned int r;
#if __CUDA_ARCH__ >= 300
    unsigned int c = 0;
    asm ("vset2.s32.s32.lt %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
    c = r << 16;        // convert bool
    r = c - r;          //  into mask
#else  /* __CUDA_ARCH__ >= 300 */
    asm ("{                             \n\t"
         ".reg .u32 a, b, r, s, t, u;   \n\t"
         "mov.b32        a,%1;          \n\t"
         "mov.b32        b,%2;          \n\t"
         "and.b32        s,a,0xffff0000;\n\t" // high halfword of a
         "and.b32        t,b,0xffff0000;\n\t" // high halfword of b
         "set.lt.s32.s32 u,s,t;         \n\t" // compare high halfwords
         "cvt.s32.s16    s,a;           \n\t" // sign-extend low halfword of a
         "cvt.s32.s16    t,b;           \n\t" // sign-extend low halfword of b
         "set.lt.s32.s32 s,s,t;         \n\t" // compare low halfwords
#if __CUDA_ARCH__ >= 200
         "prmt.b32       r,s,u,0x7632;  \n\t" // combine the two mask results
#else  /* __CUDA_ARCH__ >= 200 */
         "and.b32        u,u,0xffff0000;\n\t" // mask high comparison result
         "and.b32        s,s,0x0000ffff;\n\t" // mask low comparison result
         "or.b32         r,s,u;         \n\t" // combine the two results
#endif /* __CUDA_ARCH__ >= 200 */
         "mov.b32        %0,r;          \n\t"
         "}"
         : "=r"(r) : "r"(a), "r"(b));
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // halfword-wise signed lt comparison, mask result
}
static __device__ __forceinline__ unsigned int vcmpltu2(unsigned int a, unsigned int b)
{
    unsigned int r, c;
#if __CUDA_ARCH__ >= 300
    c = 0;
    asm ("vset2.u32.u32.lt %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
    c = r << 16;        // convert bool
    r = c - r;          //  into mask
#else  /* __CUDA_ARCH__ >= 300 */
    asm ("not.b32 %0,%0;" : "+r"(a));
    c = vhaddu2 (a, b); // (b + ~a) / 2 = (b - a) / 2 [rounded down]
#if __CUDA_ARCH__ >= 200
    asm ("prmt.b32 %0,%1,0,0xbb99;" : "=r"(r) : "r"(c)); // build mask from msbs
#else  /* __CUDA_ARCH__ >= 200 */
    asm ("and.b32 %0,%0,0x80008000;" : "+r"(c));  // msb = carry-outs
    asm ("shr.u32 %0,%1,15;" : "=r"(r) : "r"(c)); // convert
    asm ("sub.u32 %0,%1,%0;" : "+r"(r) : "r"(c)); //  msbs to
    asm ("or.b32  %0,%1,%0;" : "+r"(r) : "r"(c)); //   mask
#endif /* __CUDA_ARCH__ >= 200 */
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // halfword-wise unsigned lt comparison, mask result
}
static __device__ __forceinline__ unsigned int vcmpne2(unsigned int a, unsigned int b)
{
    unsigned int r, c;
#if __CUDA_ARCH__ >= 300
    c = 0;
    asm ("vset2.u32.u32.ne %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
    c = r << 16;        // convert bool
    r = c - r;          //  into mask
#else  /* __CUDA_ARCH__ >= 300 */
    // inspired by Alan Mycroft's null-byte detection algorithm
    r = a ^ b;          // 0x0000 if a == b
    c = r | 0x80008000; // set msbs, to catch carry out
    c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
    c = r | c;          // msb = 1, if r was not 0x0000
#if __CUDA_ARCH__ >= 200
    asm ("prmt.b32 %0,%1,0,0xbb99;" : "=r"(r) : "r"(c)); // build mask from msbs
#else  /* __CUDA_ARCH__ >= 200 */
    asm ("and.b32 %0,%0,0x80008000;" : "+r"(c));  // extract msbs
    asm ("shr.u32 %0,%1,15;" : "=r"(r) : "r"(c)); // convert
    asm ("sub.u32 %0,%1,%0;" : "+r"(r) : "r"(c)); //  msbs to
    asm ("or.b32  %0,%1,%0;" : "+r"(r) : "r"(c)); //   mask
#endif /* __CUDA_ARCH__ >= 200 */
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // halfword-wise (un)signed ne comparison, mask result
}
static __device__ __forceinline__ unsigned int vabsdiffu2(unsigned int a, unsigned int b)
{
    unsigned int r, s;
#if __CUDA_ARCH__ >= 300
    s = 0;
    asm ("vabsdiff2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(s));
#else  /* __CUDA_ARCH__ >= 300 */
    unsigned int t, u, v;
    s = a & 0x0000ffff; // extract low halfword
    r = b & 0x0000ffff; // extract low halfword
    u = max (r, s);     // maximum of low halfwords
    v = min (r, s);     // minimum of low halfwords
    s = a & 0xffff0000; // extract high halfword
    r = b & 0xffff0000; // extract high halfword
    t = max (r, s);     // maximum of high halfwords
    s = min (r, s);     // minimum of high halfwords
    r = u - v;          // low halfword: |a - b| = max(a,b) - min(a,b)
    t = t - s;          // high halfword: |a - b| = max(a,b) - min(a,b)
    r = t + r;          // combine halfword absolute differences
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // halfword-wise absolute difference of unsigned ints
}
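/* The min/max emulations below compare each halfword in isolation: the low
   halfwords are sign- or zero-extended into full registers, the high
   halfwords are compared with the low bits masked off, and the per-lane
   winners are recombined. */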
static __device__ __forceinline__ unsigned int vmaxs2(unsigned int a, unsigned int b)
{
    unsigned int r, s;
#if __CUDA_ARCH__ >= 300
    s = 0;
    asm ("vmax2.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(s));
#else  /* __CUDA_ARCH__ >= 300 */
    unsigned int t, u;
    asm ("cvt.s32.s16 %0,%1;" : "=r"(r) : "r"(a)); // sign-extend low halfword
    asm ("cvt.s32.s16 %0,%1;" : "=r"(s) : "r"(b)); // sign-extend low halfword
    t = max ((int)r, (int)s); // maximum of low halfwords
    r = a & 0xffff0000;       // isolate high halfword
    s = b & 0xffff0000;       // isolate high halfword
    u = max ((int)r, (int)s); // maximum of high halfwords
    r = u | (t & 0xffff);     // combine halfword maximums
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // halfword-wise maximum of signed integers
}
static __device__ __forceinline__ unsigned int vmaxu2(unsigned int a, unsigned int b)
{
    unsigned int r, s;
#if __CUDA_ARCH__ >= 300
    s = 0;
    asm ("vmax2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(s));
#else  /* __CUDA_ARCH__ >= 300 */
    unsigned int t, u;
    r = a & 0x0000ffff; // extract low halfword
    s = b & 0x0000ffff; // extract low halfword
    t = max (r, s);     // maximum of low halfwords
    r = a & 0xffff0000; // extract high halfword
    s = b & 0xffff0000; // extract high halfword
    u = max (r, s);     // maximum of high halfwords
    r = t | u;          // combine halfword maximums
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // halfword-wise maximum of unsigned integers
}
static __device__ __forceinline__ unsigned int vmins2(unsigned int a, unsigned int b)
{
    unsigned int r, s;
#if __CUDA_ARCH__ >= 300
    s = 0;
    asm ("vmin2.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(s));
#else  /* __CUDA_ARCH__ >= 300 */
    unsigned int t, u;
    asm ("cvt.s32.s16 %0,%1;" : "=r"(r) : "r"(a)); // sign-extend low halfword
    asm ("cvt.s32.s16 %0,%1;" : "=r"(s) : "r"(b)); // sign-extend low halfword
    t = min ((int)r, (int)s); // minimum of low halfwords
    r = a & 0xffff0000;       // isolate high halfword
    s = b & 0xffff0000;       // isolate high halfword
    u = min ((int)r, (int)s); // minimum of high halfwords
    r = u | (t & 0xffff);     // combine halfword minimums
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // halfword-wise minimum of signed integers
}
static __device__ __forceinline__ unsigned int vminu2(unsigned int a, unsigned int b)
{
    unsigned int r, s;
#if __CUDA_ARCH__ >= 300
    s = 0;
    asm ("vmin2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(s));
#else  /* __CUDA_ARCH__ >= 300 */
    unsigned int t, u;
    r = a & 0x0000ffff; // extract low halfword
    s = b & 0x0000ffff; // extract low halfword
    t = min (r, s);     // minimum of low halfwords
    r = a & 0xffff0000; // extract high halfword
    s = b & 0xffff0000; // extract high halfword
    u = min (r, s);     // minimum of high halfwords
    r = t | u;          // combine halfword minimums
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // halfword-wise minimum of unsigned integers
}
static __device__ __forceinline__ unsigned int vseteq2(unsigned int a, unsigned int b)
{
    unsigned int r, c;
#if __CUDA_ARCH__ >= 300
    c = 0;
    asm ("vset2.u32.u32.eq %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#else  /* __CUDA_ARCH__ >= 300 */
    // inspired by Alan Mycroft's null-byte detection algorithm
    r = a ^ b;          // 0x0000 if a == b
    c = r | 0x80008000; // set msbs, to catch carry out
    r = r ^ c;          // extract msbs, msb = 1 if r < 0x8000
    c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
    c = r & ~c;         // msb = 1, if r was 0x0000
    r = c >> 15;        // convert to bool
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // halfword-wise (un)signed eq comparison, bool result
}
static __device__ __forceinline__ unsigned int vsetges2(unsigned int a, unsigned int b)
{
    unsigned int r;
#if __CUDA_ARCH__ >= 300
    unsigned int c = 0;
    asm ("vset2.s32.s32.ge %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#else  /* __CUDA_ARCH__ >= 300 */
    asm ("{                             \n\t"
         ".reg .u32 a, b, r, s, t, u;   \n\t"
         "mov.b32        a,%1;          \n\t"
         "mov.b32        b,%2;          \n\t"
         "and.b32        s,a,0xffff0000;\n\t" // high halfword of a
         "and.b32        t,b,0xffff0000;\n\t" // high halfword of b
         "set.ge.s32.s32 u,s,t;         \n\t" // compare high halfwords
         "cvt.s32.s16    s,a;           \n\t" // sign-extend low halfword of a
         "cvt.s32.s16    t,b;           \n\t" // sign-extend low halfword of b
         "set.ge.s32.s32 s,s,t;         \n\t" // compare low halfwords
#if __CUDA_ARCH__ >= 200
         "prmt.b32       r,s,u,0x7632;  \n\t" // combine the two mask results
         "and.b32        r,r,0x00010001;\n\t" // convert masks to bools
#else  /* __CUDA_ARCH__ >= 200 */
         "and.b32        u,u,0x00010000;\n\t" // extract bool of high compare
         "and.b32        s,s,0x00000001;\n\t" // extract bool of low compare
         "or.b32         r,s,u;         \n\t" // combine the two bools
#endif /* __CUDA_ARCH__ >= 200 */
         "mov.b32        %0,r;          \n\t"
         "}"
         : "=r"(r) : "r"(a), "r"(b));
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // halfword-wise signed ge comparison, bool result
}
static __device__ __forceinline__ unsigned int vsetgeu2(unsigned int a, unsigned int b)
{
    unsigned int r, c;
#if __CUDA_ARCH__ >= 300
    c = 0;
    asm ("vset2.u32.u32.ge %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#else  /* __CUDA_ARCH__ >= 300 */
    asm ("not.b32 %0,%0;" : "+r"(b));
    c = vavgu2 (a, b);  // (a + ~b + 1) / 2 = (a - b) / 2
    c = c & 0x80008000; // msb = carry-outs
    r = c >> 15;        // convert to bool
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // halfword-wise unsigned ge comparison, bool result
}
static __device__ __forceinline__ unsigned int vsetgts2(unsigned int a, unsigned int b)
{
    unsigned int r;
#if __CUDA_ARCH__ >= 300
    unsigned int c = 0;
    asm ("vset2.s32.s32.gt %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#else  /* __CUDA_ARCH__ >= 300 */
    asm ("{                             \n\t"
         ".reg .u32 a, b, r, s, t, u;   \n\t"
         "mov.b32        a,%1;          \n\t"
         "mov.b32        b,%2;          \n\t"
         "and.b32        s,a,0xffff0000;\n\t" // high halfword of a
         "and.b32        t,b,0xffff0000;\n\t" // high halfword of b
         "set.gt.s32.s32 u,s,t;         \n\t" // compare high halfwords
         "cvt.s32.s16    s,a;           \n\t" // sign-extend low halfword of a
         "cvt.s32.s16    t,b;           \n\t" // sign-extend low halfword of b
         "set.gt.s32.s32 s,s,t;         \n\t" // compare low halfwords
#if __CUDA_ARCH__ >= 200
         "prmt.b32       r,s,u,0x7632;  \n\t" // combine the two mask results
         "and.b32        r,r,0x00010001;\n\t" // convert masks to bools
#else  /* __CUDA_ARCH__ >= 200 */
         "and.b32        u,u,0x00010000;\n\t" // extract bool of high compare
         "and.b32        s,s,0x00000001;\n\t" // extract bool of low compare
         "or.b32         r,s,u;         \n\t" // combine the two bools
#endif /* __CUDA_ARCH__ >= 200 */
         "mov.b32        %0,r;          \n\t"
         "}"
         : "=r"(r) : "r"(a), "r"(b));
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // halfword-wise signed gt comparison, bool result
}
static __device__ __forceinline__ unsigned int vsetgtu2(unsigned int a, unsigned int b)
{
    unsigned int r, c;
#if __CUDA_ARCH__ >= 300
    c = 0;
    asm ("vset2.u32.u32.gt %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#else  /* __CUDA_ARCH__ >= 300 */
    asm ("not.b32 %0,%0;" : "+r"(b));
    c = vhaddu2 (a, b); // (a + ~b) / 2 = (a - b) / 2 [rounded down]
    c = c & 0x80008000; // msb = carry-outs
    r = c >> 15;        // convert to bool
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // halfword-wise unsigned gt comparison, bool result
}
static __device__ __forceinline__ unsigned int vsetles2(unsigned int a, unsigned int b)
{
    unsigned int r;
#if __CUDA_ARCH__ >= 300
    unsigned int c = 0;
    asm ("vset2.s32.s32.le %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#else  /* __CUDA_ARCH__ >= 300 */
    asm ("{                             \n\t"
         ".reg .u32 a, b, r, s, t, u;   \n\t"
         "mov.b32        a,%1;          \n\t"
         "mov.b32        b,%2;          \n\t"
         "and.b32        s,a,0xffff0000;\n\t" // high halfword of a
         "and.b32        t,b,0xffff0000;\n\t" // high halfword of b
         "set.le.s32.s32 u,s,t;         \n\t" // compare high halfwords
         "cvt.s32.s16    s,a;           \n\t" // sign-extend low halfword of a
         "cvt.s32.s16    t,b;           \n\t" // sign-extend low halfword of b
         "set.le.s32.s32 s,s,t;         \n\t" // compare low halfwords
#if __CUDA_ARCH__ >= 200
         "prmt.b32       r,s,u,0x7632;  \n\t" // combine the two mask results
         "and.b32        r,r,0x00010001;\n\t" // convert masks to bools
#else  /* __CUDA_ARCH__ >= 200 */
         "and.b32        u,u,0x00010000;\n\t" // extract bool of high compare
         "and.b32        s,s,0x00000001;\n\t" // extract bool of low compare
         "or.b32         r,s,u;         \n\t" // combine the two bools
#endif /* __CUDA_ARCH__ >= 200 */
         "mov.b32        %0,r;          \n\t"
         "}"
         : "=r"(r) : "r"(a), "r"(b));
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // halfword-wise signed le comparison, bool result
}
static __device__ __forceinline__ unsigned int vsetleu2(unsigned int a, unsigned int b)
{
    unsigned int r, c;
#if __CUDA_ARCH__ >= 300
    c = 0;
    asm ("vset2.u32.u32.le %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#else  /* __CUDA_ARCH__ >= 300 */
    asm ("not.b32 %0,%0;" : "+r"(a));
    c = vavgu2 (a, b);  // (b + ~a + 1) / 2 = (b - a) / 2
    c = c & 0x80008000; // msb = carry-outs
    r = c >> 15;        // convert to bool
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // halfword-wise unsigned le comparison, bool result
}
static __device__ __forceinline__ unsigned int vsetlts2(unsigned int a, unsigned int b)
{
    unsigned int r;
#if __CUDA_ARCH__ >= 300
    unsigned int c = 0;
    asm ("vset2.s32.s32.lt %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#else  /* __CUDA_ARCH__ >= 300 */
    asm ("{                             \n\t"
         ".reg .u32 a, b, r, s, t, u;   \n\t"
         "mov.b32        a,%1;          \n\t"
         "mov.b32        b,%2;          \n\t"
         "and.b32        s,a,0xffff0000;\n\t" // high halfword of a
         "and.b32        t,b,0xffff0000;\n\t" // high halfword of b
         "set.lt.s32.s32 u,s,t;         \n\t" // compare high halfwords
         "cvt.s32.s16    s,a;           \n\t" // sign-extend low halfword of a
         "cvt.s32.s16    t,b;           \n\t" // sign-extend low halfword of b
         "set.lt.s32.s32 s,s,t;         \n\t" // compare low halfwords
#if __CUDA_ARCH__ >= 200
         "prmt.b32       r,s,u,0x7632;  \n\t" // combine the two mask results
         "and.b32        r,r,0x00010001;\n\t" // convert masks to bools
#else  /* __CUDA_ARCH__ >= 200 */
         "and.b32        u,u,0x00010000;\n\t" // extract bool of high compare
         "and.b32        s,s,0x00000001;\n\t" // extract bool of low compare
         "or.b32         r,s,u;         \n\t" // combine the two bools
#endif /* __CUDA_ARCH__ >= 200 */
         "mov.b32        %0,r;          \n\t"
         "}"
         : "=r"(r) : "r"(a), "r"(b));
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // halfword-wise signed lt comparison, bool result
}
static __device__ __forceinline__ unsigned int vsetltu2(unsigned int a, unsigned int b)
{
    unsigned int r, c;
#if __CUDA_ARCH__ >= 300
    c = 0;
    asm ("vset2.u32.u32.lt %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#else  /* __CUDA_ARCH__ >= 300 */
    asm ("not.b32 %0,%0;" : "+r"(a));
    c = vhaddu2 (a, b); // (b + ~a) / 2 = (b - a) / 2 [rounded down]
    c = c & 0x80008000; // msb = carry-outs
    r = c >> 15;        // convert to bool
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // halfword-wise unsigned lt comparison, bool result
}
static __device__ __forceinline__ unsigned int vsetne2(unsigned int a, unsigned int b)
{
    unsigned int r, c;
#if __CUDA_ARCH__ >= 300
    c = 0;
    asm ("vset2.u32.u32.ne %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#else  /* __CUDA_ARCH__ >= 300 */
    // inspired by Alan Mycroft's null-byte detection algorithm
    r = a ^ b;          // 0x0000 if a == b
    c = r | 0x80008000; // set msbs, to catch carry out
    c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
    c = r | c;          // msb = 1, if r was not 0x0000
    c = c & 0x80008000; // extract msbs
    r = c >> 15;        // convert to bool
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // halfword-wise (un)signed ne comparison, bool result
}
static __device__ __forceinline__ unsigned int vsadu2(unsigned int a, unsigned int b)
{
    unsigned int r, s;
#if __CUDA_ARCH__ >= 300
    s = 0;
    asm ("vabsdiff2.u32.u32.u32.add %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(s));
#else  /* __CUDA_ARCH__ >= 300 */
    unsigned int t, u, v;
    s = a & 0x0000ffff; // extract low halfword
    r = b & 0x0000ffff; // extract low halfword
    u = max (r, s);     // maximum of low halfwords
    v = min (r, s);     // minimum of low halfwords
    s = a & 0xffff0000; // extract high halfword
    r = b & 0xffff0000; // extract high halfword
    t = max (r, s);     // maximum of high halfwords
    s = min (r, s);     // minimum of high halfwords
    u = u - v;          // low halfword: |a - b| = max(a,b) - min(a,b)
    t = t - s;          // high halfword: |a - b| = max(a,b) - min(a,b)
#if __CUDA_ARCH__ < 350
    asm ("shr.u32 %0,%0,16;" : "+r"(t)); // normalize high halfword difference
#else  /* __CUDA_ARCH__ < 350 */
    asm ("shf.r.clamp.b32 %0,%0,0,16;" : "+r"(t)); // normalize high difference
#endif /* __CUDA_ARCH__ < 350 */
    r = t + u;          // sum the halfword absolute differences
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // halfword-wise sum of abs differences of unsigned ints
}
static __device__ __forceinline__ unsigned int vsub2(unsigned int a, unsigned int b)
{
    unsigned int s, t;
#if __CUDA_ARCH__ >= 300
    s = 0;
    asm ("vsub2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(t) : "r"(a), "r"(b), "r"(s));
#else  /* __CUDA_ARCH__ >= 300 */
    s = a ^ b;          // sum bits
    t = a - b;          // actual difference
    s = s ^ t;          // determine borrow-ins for each bit position
    s = s & 0x00010000; // borrow into high halfword = borrow out of low one
    t = t + s;          // compensate for borrow out of low halfword
#endif /* __CUDA_ARCH__ >= 300 */
    return t;           // halfword-wise difference, with wrap-around
}
static __device__ __forceinline__ unsigned int vsubss2 (unsigned int a, unsigned int b)
{
    unsigned int r;
#if __CUDA_ARCH__ >= 300
    unsigned int c = 0;
    asm ("vsub2.s32.s32.s32.sat %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#else  /* __CUDA_ARCH__ >= 300 */
    int ahi, alo, blo, bhi, rhi, rlo;
    ahi = (int)(a & 0xffff0000U);
    bhi = (int)(b & 0xffff0000U);
#if __CUDA_ARCH__ < 200
    alo = (int)(a << 16);
    blo = (int)(b << 16);
#elif __CUDA_ARCH__ < 350
    asm ("prmt.b32 %0,%1,0,0x1044;" : "=r"(alo) : "r"(a)); // a << 16 via permute
    asm ("prmt.b32 %0,%1,0,0x1044;" : "=r"(blo) : "r"(b)); // b << 16 via permute
#else  /* __CUDA_ARCH__ < 350 */
    asm ("shf.l.clamp.b32 %0,0,%1,16;" : "=r"(alo) : "r"(a)); // a << 16
    asm ("shf.l.clamp.b32 %0,0,%1,16;" : "=r"(blo) : "r"(b)); // b << 16
#endif /* __CUDA_ARCH__ < 350 */
    asm ("sub.sat.s32 %0,%1,%2;" : "=r"(rlo) : "r"(alo), "r"(blo)); // low halves
    asm ("sub.sat.s32 %0,%1,%2;" : "=r"(rhi) : "r"(ahi), "r"(bhi)); // high halves
#if __CUDA_ARCH__ < 200
    r = ((unsigned int)rhi & 0xffff0000U) | ((unsigned int)rlo >> 16);
#else  /* __CUDA_ARCH__ < 200 */
    asm ("prmt.b32 %0,%1,%2,0x7632;" : "=r"(r) : "r"(rlo), "r"(rhi)); // combine
#endif /* __CUDA_ARCH__ < 200 */
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // halfword-wise difference with signed saturation
}
static __device__ __forceinline__ unsigned int vsubus2 (unsigned int a, unsigned int b)
{
    unsigned int r;
#if __CUDA_ARCH__ >= 300
    unsigned int c = 0;
    asm ("vsub2.u32.u32.u32.sat %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#else  /* __CUDA_ARCH__ >= 300 */
    int alo, blo, ahi, bhi, rlo, rhi;
    asm ("{                             \n\t"
         "and.b32 %0, %4, 0xffff;       \n\t" // low halfword of a
         "and.b32 %1, %5, 0xffff;       \n\t" // low halfword of b
#if __CUDA_ARCH__ < 350
         "shr.u32 %2, %4, 16;           \n\t" // high halfword of a
         "shr.u32 %3, %5, 16;           \n\t" // high halfword of b
#else  /* __CUDA_ARCH__ < 350 */
         "shf.r.clamp.b32 %2, %4, 0, 16;\n\t" // high halfword of a
         "shf.r.clamp.b32 %3, %5, 0, 16;\n\t" // high halfword of b
#endif /* __CUDA_ARCH__ < 350 */
         "}"
         : "=r"(alo), "=r"(blo), "=r"(ahi), "=r"(bhi)
         : "r"(a), "r"(b));
    rlo = max ((int)(alo - blo), 0); // clamp low halfword difference to 0
    rhi = max ((int)(ahi - bhi), 0); // clamp high halfword difference to 0
    r = rhi * 65536 + rlo;           // repack halfwords
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // halfword-wise difference with unsigned saturation
}
static __device__ __forceinline__ unsigned int vneg2(unsigned int a)
{
    return vsub2 (0, a); // halfword-wise negation, with wrap-around
}
static __device__ __forceinline__ unsigned int vnegss2(unsigned int a)
{
    return vsubss2 (0, a); // halfword-wise negation with signed saturation
}
static __device__ __forceinline__ unsigned int vabsdiffs2(unsigned int a, unsigned int b)
{
    unsigned int r, s;
#if __CUDA_ARCH__ >= 300
    s = 0;
    asm ("vabsdiff2.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(s));
#else  /* __CUDA_ARCH__ >= 300 */
    s = vcmpges2 (a, b);// mask = 0xffff if a >= b
    r = a ^ b;
    s = (r & s) ^ b;    // select a where a >= b, else b => max(a,b)
    r = s ^ r;          // select b where a >= b, else a => min(a,b)
    r = vsub2 (s, r);   // |a - b| = max(a,b) - min(a,b)
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // halfword-wise absolute difference of signed integers
}
static __device__ __forceinline__ unsigned int vsads2(unsigned int a, unsigned int b)
{
    unsigned int r, s;
#if __CUDA_ARCH__ >= 300
    s = 0;
    asm ("vabsdiff2.s32.s32.s32.add %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(s));
#else  /* __CUDA_ARCH__ >= 300 */
    s = vabsdiffs2 (a, b);
    r = (s >> 16) + (s & 0x0000ffff); // sum the halfword absolute differences
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // halfword-wise sum of abs differences of signed ints
}
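/*
 * Byte-wise ("4") operations follow. They mirror the halfword versions
 * above, with 0x80808080 / 0x7f7f7f7f taking the place of the halfword
 * masks and the prmt selector 0xba98 (replicate the msb of each byte)
 * taking the place of 0xbb99.
 */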
static __device__ __forceinline__ unsigned int vabs4(unsigned int a)
{
    unsigned int r;
#if __CUDA_ARCH__ >= 300
    unsigned int b = 0, c = 0;
    asm ("vabsdiff4.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#elif __CUDA_ARCH__ >= 200
    asm ("{                       \n\t"
         ".reg .u32 a,m,r;        \n\t"
         "mov.b32  a,%1;          \n\t"
         "prmt.b32 m,a,0,0xba98;  \n\t" // sign(a) ? 0xff : 0x00 per byte
         "xor.b32  r,a,m;         \n\t" // conditionally invert bits
         "and.b32  m,m,0x01010101;\n\t" // sign(a) ? 1 : 0 per byte
         "add.u32  r,r,m;         \n\t" // conditionally add 1
         "mov.b32  %0,r;          \n\t"
         "}"
         : "=r"(r) : "r"(a));
#else  /* __CUDA_ARCH__ >= 200 */
    asm ("{                       \n\t"
         ".reg .u32 a,m,r,s;      \n\t"
         "mov.b32  a,%1;          \n\t"
         "and.b32  m,a,0x80808080;\n\t" // extract sign bits
         "and.b32  r,a,0x7f7f7f7f;\n\t" // clear sign bits
         "shr.u32  s,m,7;         \n\t" // sign(a) ? 1 : 0 per byte
         "sub.u32  m,m,s;         \n\t" // sign(a) ? 0x7f : 0 per byte
         "xor.b32  r,r,m;         \n\t" // conditionally invert low bits
         "add.u32  r,r,s;         \n\t" // conditionally add 1
         "mov.b32  %0,r;          \n\t"
         "}"
         : "=r"(r) : "r"(a));
#endif /* __CUDA_ARCH__ >= 200 */
    return r;           // byte-wise absolute value, with wrap-around
}
static __device__ __forceinline__ unsigned int vabsss4(unsigned int a)
{
    unsigned int r;
#if __CUDA_ARCH__ >= 300
    unsigned int b = 0, c = 0;
    asm ("vabsdiff4.s32.s32.s32.sat %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#elif __CUDA_ARCH__ >= 200
    asm ("{                       \n\t"
         ".reg .u32 a,m,r;        \n\t"
         "mov.b32  a,%1;          \n\t"
         "prmt.b32 m,a,0,0xba98;  \n\t" // sign(a) ? 0xff : 0x00 per byte
         "xor.b32  r,a,m;         \n\t" // conditionally invert bits
         "and.b32  m,m,0x01010101;\n\t" // sign(a) ? 1 : 0 per byte
         "add.u32  r,r,m;         \n\t" // conditionally add 1
         "prmt.b32 m,r,0,0xba98;  \n\t" // sign(r) ? 0xff : 0x00 per byte
         "and.b32  m,m,0x01010101;\n\t" // sign(r) ? 1 : 0 per byte
         "sub.u32  r,r,m;         \n\t" // subtract 1 where result wrapped to 0x80
         "mov.b32  %0,r;          \n\t"
         "}"
         : "=r"(r) : "r"(a));
#else  /* __CUDA_ARCH__ >= 200 */
    asm ("{                       \n\t"
         ".reg .u32 a,m,r,s;      \n\t"
         "mov.b32  a,%1;          \n\t"
         "and.b32  m,a,0x80808080;\n\t" // extract sign bits
         "and.b32  r,a,0x7f7f7f7f;\n\t" // clear sign bits
         "shr.u32  s,m,7;         \n\t" // sign(a) ? 1 : 0 per byte
         "sub.u32  m,m,s;         \n\t" // sign(a) ? 0x7f : 0 per byte
         "xor.b32  r,r,m;         \n\t" // conditionally invert low bits
         "add.u32  r,r,s;         \n\t" // conditionally add 1
         "and.b32  m,r,0x80808080;\n\t" // result still negative?
         "shr.u32  s,m,7;         \n\t" // ... then input was 0x80 ...
         "sub.u32  r,r,s;         \n\t" // ... clamp result to 0x7f
         "mov.b32  %0,r;          \n\t"
         "}"
         : "=r"(r) : "r"(a));
#endif /* __CUDA_ARCH__ >= 200 */
    return r;           // byte-wise absolute value with signed saturation
}
static __device__ __forceinline__ unsigned int vadd4(unsigned int a, unsigned int b)
{
#if __CUDA_ARCH__ >= 300
    unsigned int r, c = 0;
    asm ("vadd4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#else  /* __CUDA_ARCH__ >= 300 */
    unsigned int r, s, t;
    s = a ^ b;          // sum bits
    r = a & 0x7f7f7f7f; // clear msbs
    t = b & 0x7f7f7f7f; // clear msbs
    s = s & 0x80808080; // msb sum bits
    r = r + t;          // add without msbs, record carry-ins in msbs
    r = r ^ s;          // combine msb sum bits with carry-ins
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // byte-wise sum, with wrap-around
}
static __device__ __forceinline__ unsigned int vaddss4 (unsigned int a, unsigned int b)
{
#if __CUDA_ARCH__ >= 300
    unsigned int r, c = 0;
    asm ("vadd4.sat.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#else  /* __CUDA_ARCH__ >= 300 */
    // Add the bytes with the msbs handled separately, detect per-byte signed
    // overflow, then replace each overflowed byte with 0x7f or 0x80 based on
    // the sign of the corresponding byte of a.
    unsigned int r;
    asm ("{                         \n\t"
         ".reg .u32 a,b,r,s,t,u;    \n\t"
         "mov.b32  a, %1;           \n\t"
         "mov.b32  b, %2;           \n\t"
         "and.b32  r, a, 0x7f7f7f7f;\n\t" // clear msbs
         "and.b32  t, b, 0x7f7f7f7f;\n\t" // clear msbs
         "xor.b32  s, a, b;         \n\t" // sum bits = a ^ b
         "add.u32  r, r, t;         \n\t" // capture msb carry-in in bit 7
         "xor.b32  t, a, r;         \n\t" // a ^ carry-in
         "not.b32  u, s;            \n\t" // ~(a ^ b)
         "and.b32  t, t, u;         \n\t" // msb: ovfl = (a ^ cy-in) & ~(a ^ b)
         "and.b32  s, s, 0x80808080;\n\t" // msb sum bits
         "xor.b32  r, r, s;         \n\t" // construct msbs of result
#if __CUDA_ARCH__ >= 200
         "prmt.b32 s,a,0,0xba98;    \n\t" // sign(a) ? 0xff : 0x00
         "xor.b32  s,s,0x7f7f7f7f;  \n\t" // sign(a) ? 0x80 : 0x7f
         "prmt.b32 t,t,0,0xba98;    \n\t" // ovfl ? 0xff : 0x00
         "and.b32  s,s,t;           \n\t" // ovfl ? saturation value : 0
         "not.b32  t,t;             \n\t" // ovfl ? 0x00 : 0xff
         "and.b32  r,r,t;           \n\t" // ovfl ? 0 : sum
         "or.b32   r,r,s;           \n\t" // merge sum and saturation value
#else  /* __CUDA_ARCH__ >= 200 */
         "and.b32  t, t, 0x80808080;\n\t" // ovfl ? 0x80 : 0x00
         "shr.u32  s, t, 7;         \n\t" // ovfl ? 1 : 0
         "not.b32  u, t;            \n\t" // ovfl ? 0x7f : 0xff (msb cleared)
         "and.b32  r, r, u;         \n\t" // clear msb of overflowed bytes
         "and.b32  u, a, t;         \n\t" // ovfl ? a & 0x80 : 0
         "sub.u32  t, t, s;         \n\t" // ovfl ? 0x7f : 0
         "shr.u32  u, u, 7;         \n\t" // ovfl ? sign(a) : 0
         "or.b32   r, r, t;         \n\t" // ovfl ? 0x7f : sum
         "add.u32  r, r, u;         \n\t" // ovfl ? 0x7f + sign(a) : sum
#endif /* __CUDA_ARCH__ >= 200 */
         "mov.b32  %0, r;           \n\t"
         "}"
         : "=r"(r) : "r"(a), "r"(b));
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // byte-wise sum with signed saturation
}
static __device__ __forceinline__ unsigned int vaddus4 (unsigned int a, unsigned int b)
{
#if __CUDA_ARCH__ >= 300
    unsigned int r, c = 0;
    asm ("vadd4.u32.u32.u32.sat %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#else  /* __CUDA_ARCH__ >= 300 */
    // Add the bytes with the msbs handled separately, then clamp each byte
    // that produced a carry-out (i.e., overflowed) to 0xff.
    unsigned int r;
    asm ("{                         \n\t"
         ".reg .u32 a,b,r,s,t,m;    \n\t"
         "mov.b32  a, %1;           \n\t"
         "mov.b32  b, %2;           \n\t"
         "or.b32   m, a, b;         \n\t" // msb = 1, if either msb is 1
         "and.b32  r, a, 0x7f7f7f7f;\n\t" // clear msbs
         "and.b32  t, b, 0x7f7f7f7f;\n\t" // clear msbs
         "and.b32  m, m, 0x80808080;\n\t" // extract msbs
         "add.u32  r, r, t;         \n\t" // add without msbs, record carry-ins
         "and.b32  t, a, b;         \n\t" // msb = 1, if both msbs are 1
         "or.b32   t, t, r;         \n\t" // msb = 1, if carry-out possible
         "or.b32   r, r, m;         \n\t" // msbs of result
         "and.b32  t, t, m;         \n\t" // msb = 1, if byte overflowed
#if __CUDA_ARCH__ >= 200
         "prmt.b32 t, t, 0, 0xba98; \n\t" // ovfl ? 0xff : 0x00 per byte
#else  /* __CUDA_ARCH__ >= 200 */
         "shr.u32  s, t, 7;         \n\t" // ovfl ? 1 : 0
         "sub.u32  t, t, s;         \n\t" // ovfl ? 0x7f : 0
#endif /* __CUDA_ARCH__ >= 200 */
         "or.b32   r, r, t;         \n\t" // clamp overflowed bytes to 0xff
         "mov.b32  %0, r;           \n\t"
         "}"
         : "=r"(r) : "r"(a), "r"(b));
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // byte-wise sum with unsigned saturation
}
static __device__ __forceinline__ unsigned int vavgs4(unsigned int a, unsigned int b)
{
    unsigned int r;
#if __CUDA_ARCH__ >= 300
    unsigned int c = 0;
    asm ("vavrg4.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#else  /* __CUDA_ARCH__ >= 300 */
    // avgs (a, b) = ((a + b) < 0) ? ((a + b) >> 1) : ((a + b + 1) >> 1),
    // computed per byte as in vavgs2, with byte-sized masks.
    asm ("{                      \n\t"
         ".reg .u32 a,b,c,r,s,t,u,v;\n\t"
         "mov.b32 a,%1;          \n\t"
         "mov.b32 b,%2;          \n\t"
         "and.b32 u,a,0xfefefefe;\n\t" // prevent shift crossing chunk boundary
         "and.b32 v,b,0xfefefefe;\n\t" // prevent shift crossing chunk boundary
         "xor.b32 s,a,b;         \n\t" // a ^ b
         "and.b32 t,a,b;         \n\t" // a & b
         "shr.u32 u,u,1;         \n\t" // a >> 1
         "shr.u32 v,v,1;         \n\t" // b >> 1
         "and.b32 c,s,0x01010101;\n\t" // (a ^ b) & 1
         "and.b32 s,s,0x80808080;\n\t" // extract msbs of (a ^ b)
         "and.b32 t,t,0x01010101;\n\t" // (a & b) & 1
         "add.u32 r,u,v;         \n\t" // (a >> 1) + (b >> 1)
         "add.u32 r,r,t;         \n\t" // ... + ((a & b) & 1); record msb cy-in
         "xor.b32 r,r,s;         \n\t" // compute msb sum bit: a ^ b ^ cy-in
         "shr.u32 t,r,7;         \n\t" // sign ((a + b) >> 1)
         "not.b32 t,t;           \n\t" // ~sign ((a + b) >> 1)
         "and.b32 t,t,c;         \n\t" // ((a ^ b) & 1) & ~sign
         "add.u32 r,r,t;         \n\t" // conditionally round up
         "mov.b32 %0,r;          \n\t"
         "}"
         : "=r"(r) : "r"(a), "r"(b));
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // byte-wise average of signed integers
}
static __device__ __forceinline__ unsigned int vavgu4(unsigned int a, unsigned int b)
{
    unsigned int r, c;
#if __CUDA_ARCH__ >= 300
    c = 0;
    asm ("vavrg4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#else  /* __CUDA_ARCH__ >= 300 */
    // HAKMEM #23: a averaged with b is given by (a | b) - ((a ^ b) >> 1)
    c = a ^ b;
    r = a | b;
    c = c & 0xfefefefe; // ensure shift doesn't cross byte boundaries
    c = c >> 1;
    r = r - c;
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // byte-wise average of unsigned integers, rounded up
}
static __device__ __forceinline__ unsigned int vhaddu4(unsigned int a, unsigned int b)
{
    // HAKMEM #23: a+b = 2*(a & b) + (a ^ b), so (a+b)/2 = (a&b) + ((a^b)>>1)
    unsigned int s = a ^ b;
    s = s & 0xfefefefe;        // ensure shift doesn't cross byte boundaries
    return (a & b) + (s >> 1); // byte-wise unsigned average, rounded down
}
static __device__ __forceinline__ unsigned int vcmpeq4(unsigned int a, unsigned int b)
{
    unsigned int r, c;
#if __CUDA_ARCH__ >= 300
    c = 0;
    asm ("vset4.u32.u32.eq %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
    c = r << 8;         // convert bool
    r = c - r;          //  into mask
#else  /* __CUDA_ARCH__ >= 300 */
    // inspired by Alan Mycroft's null-byte detection algorithm
    r = a ^ b;          // 0x00 if a == b
    c = r | 0x80808080; // set msbs, to catch carry out
    r = r ^ c;          // extract msbs, msb = 1 if r < 0x80
    c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
    c = r & ~c;         // msb = 1, if r was 0x00
#if __CUDA_ARCH__ >= 200
    asm ("prmt.b32 %0,%1,0,0xba98;" : "=r"(r) : "r"(c)); // build mask from msbs
#else  /* __CUDA_ARCH__ >= 200 */
    asm ("shr.u32 %0,%1,7;" : "=r"(r) : "r"(c));  // convert
    asm ("sub.u32 %0,%1,%0;" : "+r"(r) : "r"(c)); //  msbs to
    asm ("or.b32  %0,%1,%0;" : "+r"(r) : "r"(c)); //   mask
#endif /* __CUDA_ARCH__ >= 200 */
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // byte-wise (un)signed eq comparison, mask result
}
static __device__ __forceinline__ unsigned int vcmpges4(unsigned int a, unsigned int b)
{
    unsigned int r;
#if __CUDA_ARCH__ >= 300
    unsigned int c = 0;
    asm ("vset4.s32.s32.ge %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
    c = r << 8;         // convert bool
    r = c - r;          //  into mask
#else  /* __CUDA_ARCH__ >= 300 */
    asm ("{                       \n\t"
         ".reg .u32 a, b, r, s, t, u;\n\t"
         "mov.b32  a,%1;          \n\t"
         "mov.b32  b,%2;          \n\t"
         "xor.b32  s,a,b;         \n\t" // a ^ b
         "or.b32   r,a,0x80808080;\n\t" // set msbs
         "and.b32  t,b,0x7f7f7f7f;\n\t" // clear msbs
         "sub.u32  r,r,t;         \n\t" // subtract lsbs, msb: ~borrow-in
         "xor.b32  t,r,a;         \n\t" // msb: ~borrow-in ^ a
         "xor.b32  r,r,s;         \n\t" // msb: ~sign(a - b)
         "and.b32  t,t,s;         \n\t" // msb: ovfl = (~borrow-in ^ a) & (a^b)
         "xor.b32  t,t,r;         \n\t" // msb: ge = ovfl != ~sign(a - b)
#if __CUDA_ARCH__ >= 200
         "prmt.b32 r,t,0,0xba98;  \n\t" // build mask from msbs
#else  /* __CUDA_ARCH__ >= 200 */
         "and.b32  t,t,0x80808080;\n\t" // isolate msbs
         "shr.u32  r,t,7;         \n\t" // build mask
         "sub.u32  r,t,r;         \n\t" //  from
         "or.b32   r,r,t;         \n\t" //   msbs
#endif /* __CUDA_ARCH__ >= 200 */
         "mov.b32  %0,r;          \n\t"
         "}"
         : "=r"(r) : "r"(a), "r"(b));
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // byte-wise signed ge comparison, mask result
}
static __device__ __forceinline__ unsigned int vcmpgeu4(unsigned int a, unsigned int b)
{
    unsigned int r, c;
#if __CUDA_ARCH__ >= 300
    c = 0;
    asm ("vset4.u32.u32.ge %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
    c = r << 8;         // convert bool
    r = c - r;          //  into mask
#else  /* __CUDA_ARCH__ >= 300 */
    asm ("not.b32 %0,%0;" : "+r"(b));
    c = vavgu4 (a, b);  // (a + ~b + 1) / 2 = (a - b) / 2
#if __CUDA_ARCH__ >= 200
    asm ("prmt.b32 %0,%1,0,0xba98;" : "=r"(r) : "r"(c)); // build mask from msbs
#else  /* __CUDA_ARCH__ >= 200 */
    asm ("and.b32 %0,%0,0x80808080;" : "+r"(c));  // msb = carry-outs
    asm ("shr.u32 %0,%1,7;" : "=r"(r) : "r"(c));  // convert
    asm ("sub.u32 %0,%1,%0;" : "+r"(r) : "r"(c)); //  msbs to
    asm ("or.b32  %0,%1,%0;" : "+r"(r) : "r"(c)); //   mask
#endif /* __CUDA_ARCH__ >= 200 */
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // byte-wise unsigned ge comparison, mask result
}
static __device__ __forceinline__ unsigned int vcmpgts4(unsigned int a, unsigned int b)
{
    unsigned int r;
#if __CUDA_ARCH__ >= 300
    unsigned int c = 0;
    asm ("vset4.s32.s32.gt %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
    c = r << 8;         // convert bool
    r = c - r;          //  into mask
#else  /* __CUDA_ARCH__ >= 300 */
    /* a > b <===> a + ~b >= 0 */
    asm ("{                       \n\t"
         ".reg .u32 a,b,r,s,t,u;  \n\t"
         "mov.b32  a,%1;          \n\t"
         "mov.b32  b,%2;          \n\t"
         "not.b32  b,b;           \n\t" // ~b
         "and.b32  r,a,0x7f7f7f7f;\n\t" // clear msbs
         "and.b32  t,b,0x7f7f7f7f;\n\t" // clear msbs
         "xor.b32  s,a,b;         \n\t" // sum bits = a ^ ~b
         "add.u32  r,r,t;         \n\t" // capture msb carry-in in bit 7
         "xor.b32  t,a,r;         \n\t" // a ^ carry-in
         "not.b32  u,s;           \n\t" // ~(a ^ ~b)
         "and.b32  t,t,u;         \n\t" // msb: ovfl = (a ^ cy-in) & ~(a ^ ~b)
         "xor.b32  r,r,u;         \n\t" // msb: ~sign(sum)
         "xor.b32  t,t,r;         \n\t" // msb: gt = ovfl != ~sign(sum)
#if __CUDA_ARCH__ >= 200
         "prmt.b32 r,t,0,0xba98;  \n\t" // build mask from msbs
#else  /* __CUDA_ARCH__ >= 200 */
         "and.b32  t,t,0x80808080;\n\t" // isolate msbs
         "shr.u32  r,t,7;         \n\t" // build mask
         "sub.u32  r,t,r;         \n\t" //  from
         "or.b32   r,r,t;         \n\t" //   msbs
#endif /* __CUDA_ARCH__ >= 200 */
         "mov.b32  %0,r;          \n\t"
         "}"
         : "=r"(r) : "r"(a), "r"(b));
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // byte-wise signed gt comparison, mask result
}
static __device__ __forceinline__ unsigned int vcmpgtu4(unsigned int a, unsigned int b)
{
    unsigned int r, c;
#if __CUDA_ARCH__ >= 300
    c = 0;
    asm ("vset4.u32.u32.gt %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
    c = r << 8;         // convert bool
    r = c - r;          //  into mask
#else  /* __CUDA_ARCH__ >= 300 */
    asm ("not.b32 %0,%0;" : "+r"(b));
    c = vhaddu4 (a, b); // (a + ~b) / 2 = (a - b) / 2 [rounded down]
#if __CUDA_ARCH__ >= 200
    asm ("prmt.b32 %0,%1,0,0xba98;" : "=r"(r) : "r"(c)); // build mask from msbs
#else  /* __CUDA_ARCH__ >= 200 */
    asm ("and.b32 %0,%0,0x80808080;" : "+r"(c));  // msb = carry-outs
    asm ("shr.u32 %0,%1,7;" : "=r"(r) : "r"(c));  // convert
    asm ("sub.u32 %0,%1,%0;" : "+r"(r) : "r"(c)); //  msbs to
    asm ("or.b32  %0,%1,%0;" : "+r"(r) : "r"(c)); //   mask
#endif /* __CUDA_ARCH__ >= 200 */
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // byte-wise unsigned gt comparison, mask result
}
static __device__ __forceinline__ unsigned int vcmples4(unsigned int a, unsigned int b)
{
    unsigned int r;
#if __CUDA_ARCH__ >= 300
    unsigned int c = 0;
    asm ("vset4.s32.s32.le %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
    c = r << 8;         // convert bool
    r = c - r;          //  into mask
#else  /* __CUDA_ARCH__ >= 300 */
    /* a <= b <===> a + ~b < 0 */
    asm ("{                       \n\t"
         ".reg .u32 a,b,r,s,t,u;  \n\t"
         "mov.b32  a,%1;          \n\t"
         "mov.b32  b,%2;          \n\t"
         "not.b32  u,b;           \n\t" // ~b
         "and.b32  r,a,0x7f7f7f7f;\n\t" // clear msbs
         "and.b32  t,u,0x7f7f7f7f;\n\t" // clear msbs
         "xor.b32  u,a,b;         \n\t" // a ^ b = ~(a ^ ~b)
         "add.u32  r,r,t;         \n\t" // capture msb carry-in in bit 7
         "xor.b32  t,a,r;         \n\t" // a ^ carry-in
         "not.b32  s,u;           \n\t" // a ^ ~b
         "and.b32  t,t,u;         \n\t" // msb: ovfl = (a ^ cy-in) & ~(a ^ ~b)
         "xor.b32  r,r,s;         \n\t" // msb: sign(sum)
         "xor.b32  t,t,r;         \n\t" // msb: le = ovfl != sign(sum)
#if __CUDA_ARCH__ >= 200
         "prmt.b32 r,t,0,0xba98;  \n\t" // build mask from msbs
#else  /* __CUDA_ARCH__ >= 200 */
         "and.b32  t,t,0x80808080;\n\t" // isolate msbs
         "shr.u32  r,t,7;         \n\t" // build mask
         "sub.u32  r,t,r;         \n\t" //  from
         "or.b32   r,r,t;         \n\t" //   msbs
#endif /* __CUDA_ARCH__ >= 200 */
         "mov.b32  %0,r;          \n\t"
         "}"
         : "=r"(r) : "r"(a), "r"(b));
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // byte-wise signed le comparison, mask result
}
static __device__ __forceinline__ unsigned int vcmpleu4(unsigned int a, unsigned int b)
{
    unsigned int r, c;
#if __CUDA_ARCH__ >= 300
    c = 0;
    asm ("vset4.u32.u32.le %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
    c = r << 8;         // convert bool
    r = c - r;          //  into mask
#else  /* __CUDA_ARCH__ >= 300 */
    asm ("not.b32 %0,%0;" : "+r"(a));
    c = vavgu4 (a, b);  // (b + ~a + 1) / 2 = (b - a) / 2
#if __CUDA_ARCH__ >= 200
    asm ("prmt.b32 %0,%1,0,0xba98;" : "=r"(r) : "r"(c)); // build mask from msbs
#else  /* __CUDA_ARCH__ >= 200 */
    asm ("and.b32 %0,%0,0x80808080;" : "+r"(c));  // msb = carry-outs
    asm ("shr.u32 %0,%1,7;" : "=r"(r) : "r"(c));  // convert
    asm ("sub.u32 %0,%1,%0;" : "+r"(r) : "r"(c)); //  msbs to
    asm ("or.b32  %0,%1,%0;" : "+r"(r) : "r"(c)); //   mask
#endif /* __CUDA_ARCH__ >= 200 */
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // byte-wise unsigned le comparison, mask result
}
static __device__ __forceinline__ unsigned int vcmplts4(unsigned int a, unsigned int b)
{
    unsigned int r;
#if __CUDA_ARCH__ >= 300
    unsigned int c = 0;
    asm ("vset4.s32.s32.lt %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
    c = r << 8;         // convert bool
    r = c - r;          //  into mask
#else  /* __CUDA_ARCH__ >= 300 */
    asm ("{                       \n\t"
         ".reg .u32 a, b, r, s, t, u;\n\t"
         "mov.b32  a,%1;          \n\t"
         "mov.b32  b,%2;          \n\t"
         "not.b32  u,b;           \n\t" // ~b
         "xor.b32  s,u,a;         \n\t" // ~(a ^ b)
         "or.b32   r,a,0x80808080;\n\t" // set msbs
         "and.b32  t,b,0x7f7f7f7f;\n\t" // clear msbs
         "sub.u32  r,r,t;         \n\t" // subtract lsbs, msb: ~borrow-in
         "xor.b32  t,r,a;         \n\t" // msb: ~borrow-in ^ a
         "not.b32  u,s;           \n\t" // a ^ b
         "xor.b32  r,r,s;         \n\t" // msb: sign(a - b)
         "and.b32  t,t,u;         \n\t" // msb: ovfl = (~borrow-in ^ a) & (a^b)
         "xor.b32  t,t,r;         \n\t" // msb: lt = ovfl != sign(a - b)
#if __CUDA_ARCH__ >= 200
         "prmt.b32 r,t,0,0xba98;  \n\t" // build mask from msbs
#else  /* __CUDA_ARCH__ >= 200 */
         "and.b32  t,t,0x80808080;\n\t" // isolate msbs
         "shr.u32  r,t,7;         \n\t" // build mask
         "sub.u32  r,t,r;         \n\t" //  from
         "or.b32   r,r,t;         \n\t" //   msbs
#endif /* __CUDA_ARCH__ >= 200 */
         "mov.b32  %0,r;          \n\t"
         "}"
         : "=r"(r) : "r"(a), "r"(b));
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // byte-wise signed lt comparison, mask result
}
static __device__ __forceinline__ unsigned int vcmpltu4(unsigned int a, unsigned int b)
{
    unsigned int r, c;
#if __CUDA_ARCH__ >= 300
    c = 0;
    asm ("vset4.u32.u32.lt %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
    c = r << 8;         // convert bool
    r = c - r;          //  into mask
#else  /* __CUDA_ARCH__ >= 300 */
    asm ("not.b32 %0,%0;" : "+r"(a));
    c = vhaddu4 (a, b); // (b + ~a) / 2 = (b - a) / 2 [rounded down]
#if __CUDA_ARCH__ >= 200
    asm ("prmt.b32 %0,%1,0,0xba98;" : "=r"(r) : "r"(c)); // build mask from msbs
#else  /* __CUDA_ARCH__ >= 200 */
    asm ("and.b32 %0,%0,0x80808080;" : "+r"(c));  // msb = carry-outs
    asm ("shr.u32 %0,%1,7;" : "=r"(r) : "r"(c));  // convert
    asm ("sub.u32 %0,%1,%0;" : "+r"(r) : "r"(c)); //  msbs to
    asm ("or.b32  %0,%1,%0;" : "+r"(r) : "r"(c)); //   mask
#endif /* __CUDA_ARCH__ >= 200 */
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // byte-wise unsigned lt comparison, mask result
}
static __device__ __forceinline__ unsigned int vcmpne4(unsigned int a, unsigned int b)
{
    unsigned int r, c;
#if __CUDA_ARCH__ >= 300
    c = 0;
    asm ("vset4.u32.u32.ne %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
    c = r << 8;         // convert bool
    r = c - r;          //  into mask
#else  /* __CUDA_ARCH__ >= 300 */
    // inspired by Alan Mycroft's null-byte detection algorithm
    r = a ^ b;          // 0x00 if a == b
    c = r | 0x80808080; // set msbs, to catch carry out
    c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
    c = r | c;          // msb = 1, if r was not 0x00
#if __CUDA_ARCH__ >= 200
    asm ("prmt.b32 %0,%1,0,0xba98;" : "=r"(r) : "r"(c)); // build mask from msbs
#else  /* __CUDA_ARCH__ >= 200 */
    asm ("and.b32 %0,%0,0x80808080;" : "+r"(c));  // extract msbs
    asm ("shr.u32 %0,%1,7;" : "=r"(r) : "r"(c));  // convert
    asm ("sub.u32 %0,%1,%0;" : "+r"(r) : "r"(c)); //  msbs to
    asm ("or.b32  %0,%1,%0;" : "+r"(r) : "r"(c)); //   mask
#endif /* __CUDA_ARCH__ >= 200 */
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // byte-wise (un)signed ne comparison, mask result
}
static __device__ __forceinline__ unsigned int vabsdiffu4(unsigned int a, unsigned int b)
{
    unsigned int r, s;
#if __CUDA_ARCH__ >= 300
    s = 0;
    asm ("vabsdiff4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(s));
#else  /* __CUDA_ARCH__ >= 300 */
    s = vcmpgeu4 (a, b);// mask = 0xff if a >= b
    r = a ^ b;
    s = (r & s) ^ b;    // select a where a >= b, else b => max(a,b)
    r = s ^ r;          // select b where a >= b, else a => min(a,b)
    r = s - r;          // |a - b| = max(a,b) - min(a,b), no borrows possible
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // byte-wise absolute difference of unsigned integers
}
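/* vmaxs4/vmins4 and the unsigned variants below select bytes through the
   comparison masks produced above: r = (a & mask) | (b & ~mask). */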
static __device__ __forceinline__ unsigned int vmaxs4(unsigned int a, unsigned int b)
{
    unsigned int r, s;
#if __CUDA_ARCH__ >= 300
    s = 0;
    asm ("vmax4.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(s));
#else  /* __CUDA_ARCH__ >= 300 */
    s = vcmpges4 (a, b);// mask = 0xff if a >= b
    r = a & s;          // select bytes of a where a >= b
    s = b & ~s;         // select bytes of b where b > a
    r = r | s;          // combine byte selections
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // byte-wise maximum of signed integers
}
static __device__ __forceinline__ unsigned int vmaxu4(unsigned int a, unsigned int b)
{
    unsigned int r, s;
#if __CUDA_ARCH__ >= 300
    s = 0;
    asm ("vmax4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(s));
#else  /* __CUDA_ARCH__ >= 300 */
    s = vcmpgeu4 (a, b);// mask = 0xff if a >= b
    r = a & s;          // select bytes of a where a >= b
    s = b & ~s;         // select bytes of b where b > a
    r = r | s;          // combine byte selections
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // byte-wise maximum of unsigned integers
}
static __device__ __forceinline__ unsigned int vmins4(unsigned int a, unsigned int b)
{
    unsigned int r, s;
#if __CUDA_ARCH__ >= 300
    s = 0;
    asm ("vmin4.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(s));
#else  /* __CUDA_ARCH__ >= 300 */
    s = vcmpges4 (b, a);// mask = 0xff if b >= a
    r = a & s;          // select bytes of a where b >= a
    s = b & ~s;         // select bytes of b where b < a
    r = r | s;          // combine byte selections
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // byte-wise minimum of signed integers
}
static __device__ __forceinline__ unsigned int vminu4(unsigned int a, unsigned int b)
{
    unsigned int r, s;
#if __CUDA_ARCH__ >= 300
    s = 0;
    asm ("vmin4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(s));
#else  /* __CUDA_ARCH__ >= 300 */
    s = vcmpgeu4 (b, a);// mask = 0xff if b >= a
    r = a & s;          // select bytes of a where b >= a
    s = b & ~s;         // select bytes of b where b < a
    r = r | s;          // combine byte selections
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // byte-wise minimum of unsigned integers
}
static __device__ __forceinline__ unsigned int vseteq4(unsigned int a, unsigned int b)
{
    unsigned int r, c;
#if __CUDA_ARCH__ >= 300
    c = 0;
    asm ("vset4.u32.u32.eq %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#else  /* __CUDA_ARCH__ >= 300 */
    // inspired by Alan Mycroft's null-byte detection algorithm
    r = a ^ b;          // 0x00 if a == b
    c = r | 0x80808080; // set msbs, to catch carry out
    r = r ^ c;          // extract msbs, msb = 1 if r < 0x80
    c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
    c = r & ~c;         // msb = 1, if r was 0x00
    r = c >> 7;         // convert to bool
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // byte-wise (un)signed eq comparison, bool result
}
static __device__ __forceinline__ unsigned int vsetles4(unsigned int a, unsigned int b)
{
    unsigned int r;
#if __CUDA_ARCH__ >= 300
    unsigned int c = 0;
    asm ("vset4.s32.s32.le %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#else  /* __CUDA_ARCH__ >= 300 */
    /* a <= b <===> a + ~b < 0 */
    asm ("{                       \n\t"
         ".reg .u32 a,b,r,s,t,u;  \n\t"
         "mov.b32  a,%1;          \n\t"
         "mov.b32  b,%2;          \n\t"
         "not.b32  u,b;           \n\t" // ~b
         "and.b32  r,a,0x7f7f7f7f;\n\t" // clear msbs
         "and.b32  t,u,0x7f7f7f7f;\n\t" // clear msbs
         "xor.b32  u,a,b;         \n\t" // a ^ b = ~(a ^ ~b)
         "add.u32  r,r,t;         \n\t" // capture msb carry-in in bit 7
         "xor.b32  t,a,r;         \n\t" // a ^ carry-in
         "not.b32  s,u;           \n\t" // a ^ ~b
         "and.b32  t,t,u;         \n\t" // msb: ovfl = (a ^ cy-in) & ~(a ^ ~b)
         "xor.b32  r,r,s;         \n\t" // msb: sign(sum)
         "xor.b32  t,t,r;         \n\t" // msb: le = ovfl != sign(sum)
         "and.b32  t,t,0x80808080;\n\t" // isolate msbs
         "shr.u32  r,t,7;         \n\t" // convert to bool
         "mov.b32  %0,r;          \n\t"
         "}"
         : "=r"(r) : "r"(a), "r"(b));
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // byte-wise signed le comparison, bool result
}
static __device__ __forceinline__ unsigned int vsetleu4(unsigned int a, unsigned int b)
{
    unsigned int r, c;
#if __CUDA_ARCH__ >= 300
    c = 0;
    asm ("vset4.u32.u32.le %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#else  /* __CUDA_ARCH__ >= 300 */
    asm ("not.b32 %0,%0;" : "+r"(a));
    c = vavgu4 (a, b);  // (b + ~a + 1) / 2 = (b - a) / 2
    c = c & 0x80808080; // msb = carry-outs
    r = c >> 7;         // convert to bool
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // byte-wise unsigned le comparison, bool result
}
static __device__ __forceinline__ unsigned int vsetlts4(unsigned int a, unsigned int b)
{
    unsigned int r;
#if __CUDA_ARCH__ >= 300
    unsigned int c = 0;
    asm ("vset4.s32.s32.lt %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#else  /* __CUDA_ARCH__ >= 300 */
    asm ("{                       \n\t"
         ".reg .u32 a, b, r, s, t, u;\n\t"
         "mov.b32  a,%1;          \n\t"
         "mov.b32  b,%2;          \n\t"
         "not.b32  u,b;           \n\t" // ~b
         "or.b32   r,a,0x80808080;\n\t" // set msbs
         "and.b32  t,b,0x7f7f7f7f;\n\t" // clear msbs
         "xor.b32  s,u,a;         \n\t" // ~(a ^ b)
         "sub.u32  r,r,t;         \n\t" // subtract lsbs, msb: ~borrow-in
         "xor.b32  t,r,a;         \n\t" // msb: ~borrow-in ^ a
         "not.b32  u,s;           \n\t" // a ^ b
         "xor.b32  r,r,s;         \n\t" // msb: sign(a - b)
         "and.b32  t,t,u;         \n\t" // msb: ovfl = (~borrow-in ^ a) & (a^b)
         "xor.b32  t,t,r;         \n\t" // msb: lt = ovfl != sign(a - b)
         "and.b32  t,t,0x80808080;\n\t" // isolate msbs
         "shr.u32  r,t,7;         \n\t" // convert to bool
         "mov.b32  %0,r;          \n\t"
         "}"
         : "=r"(r) : "r"(a), "r"(b));
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // byte-wise signed lt comparison, bool result
}
static __device__ __forceinline__ unsigned int vsetltu4(unsigned int a, unsigned int b)
{
    unsigned int r, c;
#if __CUDA_ARCH__ >= 300
    c = 0;
    asm ("vset4.u32.u32.lt %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#else  /* __CUDA_ARCH__ >= 300 */
    asm ("not.b32 %0,%0;" : "+r"(a));
    c = vhaddu4 (a, b); // (b + ~a) / 2 = (b - a) / 2 [rounded down]
    c = c & 0x80808080; // msb = carry-outs
    r = c >> 7;         // convert to bool
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // byte-wise unsigned lt comparison, bool result
}
static __device__ __forceinline__ unsigned int vsetges4(unsigned int a, unsigned int b)
{
    unsigned int r;
#if __CUDA_ARCH__ >= 300
    unsigned int c = 0;
    asm ("vset4.s32.s32.ge %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#else  /* __CUDA_ARCH__ >= 300 */
    asm ("{                       \n\t"
         ".reg .u32 a, b, r, s, t, u;\n\t"
         "mov.b32  a,%1;          \n\t"
         "mov.b32  b,%2;          \n\t"
         "xor.b32  s,a,b;         \n\t" // a ^ b
         "or.b32   r,a,0x80808080;\n\t" // set msbs
         "and.b32  t,b,0x7f7f7f7f;\n\t" // clear msbs
         "sub.u32  r,r,t;         \n\t" // subtract lsbs, msb: ~borrow-in
         "xor.b32  t,r,a;         \n\t" // msb: ~borrow-in ^ a
         "xor.b32  r,r,s;         \n\t" // msb: ~sign(a - b)
         "and.b32  t,t,s;         \n\t" // msb: ovfl = (~borrow-in ^ a) & (a^b)
         "xor.b32  t,t,r;         \n\t" // msb: ge = ovfl != ~sign(a - b)
         "and.b32  t,t,0x80808080;\n\t" // isolate msbs
         "shr.u32  r,t,7;         \n\t" // convert to bool
         "mov.b32  %0,r;          \n\t"
         "}"
         : "=r"(r) : "r"(a), "r"(b));
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // byte-wise signed ge comparison, bool result
}
static __device__ __forceinline__ unsigned int vsetgeu4(unsigned int a, unsigned int b)
{
    unsigned int r, c;
#if __CUDA_ARCH__ >= 300
    c = 0;
    asm ("vset4.u32.u32.ge %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#else  /* __CUDA_ARCH__ >= 300 */
    asm ("not.b32 %0,%0;" : "+r"(b));
    c = vavgu4 (a, b);  // (a + ~b + 1) / 2 = (a - b) / 2
    c = c & 0x80808080; // msb = carry-outs
    r = c >> 7;         // convert to bool
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // byte-wise unsigned ge comparison, bool result
}
static __device__ __forceinline__ unsigned int vsetgts4(unsigned int a, unsigned int b)
{
    unsigned int r;
#if __CUDA_ARCH__ >= 300
    unsigned int c = 0;
    asm ("vset4.s32.s32.gt %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#else  /* __CUDA_ARCH__ >= 300 */
    /* a > b <===> a + ~b >= 0 */
    asm ("{                       \n\t"
         ".reg .u32 a,b,r,s,t,u;  \n\t"
         "mov.b32  a,%1;          \n\t"
         "mov.b32  b,%2;          \n\t"
         "not.b32  b,b;           \n\t" // ~b
         "and.b32  r,a,0x7f7f7f7f;\n\t" // clear msbs
         "and.b32  t,b,0x7f7f7f7f;\n\t" // clear msbs
         "xor.b32  s,a,b;         \n\t" // sum bits = a ^ ~b
         "add.u32  r,r,t;         \n\t" // capture msb carry-in in bit 7
         "xor.b32  t,a,r;         \n\t" // a ^ carry-in
         "not.b32  u,s;           \n\t" // ~(a ^ ~b)
         "and.b32  t,t,u;         \n\t" // msb: ovfl = (a ^ cy-in) & ~(a ^ ~b)
         "xor.b32  r,r,u;         \n\t" // msb: ~sign(sum)
         "xor.b32  t,t,r;         \n\t" // msb: gt = ovfl != ~sign(sum)
         "and.b32  t,t,0x80808080;\n\t" // isolate msbs
         "shr.u32  r,t,7;         \n\t" // convert to bool
         "mov.b32  %0,r;          \n\t"
         "}"
         : "=r"(r) : "r"(a), "r"(b));
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // byte-wise signed gt comparison, bool result
}
static __device__ __forceinline__ unsigned int vsetgtu4(unsigned int a, unsigned int b)
{
    unsigned int r, c;
#if __CUDA_ARCH__ >= 300
    c = 0;
    asm ("vset4.u32.u32.gt %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#else  /* __CUDA_ARCH__ >= 300 */
    asm ("not.b32 %0,%0;" : "+r"(b));
    c = vhaddu4 (a, b); // (a + ~b) / 2 = (a - b) / 2 [rounded down]
    c = c & 0x80808080; // msb = carry-outs
    r = c >> 7;         // convert to bool
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // byte-wise unsigned gt comparison, bool result
}
static __device__ __forceinline__ unsigned int vsetne4(unsigned int a, unsigned int b)
{
    unsigned int r, c;
#if __CUDA_ARCH__ >= 300
    c = 0;
    asm ("vset4.u32.u32.ne %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#else  /* __CUDA_ARCH__ >= 300 */
    // inspired by Alan Mycroft's null-byte detection algorithm
    r = a ^ b;          // 0x00 if a == b
    c = r | 0x80808080; // set msbs, to catch carry out
    c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
    c = r | c;          // msb = 1, if r was not 0x00
    c = c & 0x80808080; // extract msbs
    r = c >> 7;         // convert to bool
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // byte-wise (un)signed ne comparison, bool result
}
static __device__ __forceinline__ unsigned int vsadu4(unsigned int a, unsigned int b)
{
    unsigned int r, s;
#if __CUDA_ARCH__ >= 300
    s = 0;
    asm ("vabsdiff4.u32.u32.u32.add %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(s));
#else  /* __CUDA_ARCH__ >= 300 */
    r = vabsdiffu4 (a, b);
    s = r >> 8;         // align odd bytes with even bytes
    r = (r & 0x00ff00ff) + (s & 0x00ff00ff); // partial sums in each halfword
    r = ((r << 16) + r) >> 16;               // total lands in high halfword
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // byte-wise sum of abs differences of unsigned ints
}
static __device__ __forceinline__ unsigned int vsub4(unsigned int a, unsigned int b)
{
#if __CUDA_ARCH__ >= 300
    unsigned int r, c = 0;
    asm ("vsub4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#else  /* __CUDA_ARCH__ >= 300 */
    unsigned int r, s, t;
    s = a ^ ~b;         // inverted sum bits
    r = a | 0x80808080; // set msbs
    t = b & 0x7f7f7f7f; // clear msbs
    s = s & 0x80808080; // inverted msb sum bits
    r = r - t;          // subtract without msbs, record borrow-ins in msbs
    r = r ^ s;          // combine msb difference bits with borrow-ins
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // byte-wise difference, with wrap-around
}
static __device__ __forceinline__ unsigned int vsubss4(unsigned int a, unsigned int b)
{
    unsigned int r;
#if __CUDA_ARCH__ >= 300
    unsigned int c = 0;
    asm ("vsub4.s32.s32.s32.sat %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#else  /* __CUDA_ARCH__ >= 300 */
    // Subtract the bytes with the msbs handled separately, detect per-byte
    // signed overflow, then replace each overflowed byte with 0x7f or 0x80
    // based on the sign of the corresponding byte of a.
    asm ("{                         \n\t"
         ".reg .u32 a,b,r,s,t,u,v,w;\n\t"
         "mov.b32  a,%1;            \n\t"
         "mov.b32  b,%2;            \n\t"
         "not.b32  u,b;             \n\t" // ~b
         "xor.b32  s,u,a;           \n\t" // ~(a ^ b)
         "or.b32   r,a,0x80808080;  \n\t" // set msbs
         "and.b32  t,b,0x7f7f7f7f;  \n\t" // clear msbs
         "sub.u32  r,r,t;           \n\t" // subtract lsbs, msb: ~borrow-in
         "xor.b32  t,r,a;           \n\t" // msb: ~borrow-in ^ a
         "not.b32  u,s;             \n\t" // a ^ b
         "and.b32  s,s,0x80808080;  \n\t" // msb: ~(a ^ b)
         "xor.b32  r,r,s;           \n\t" // msb: sign(a - b)
         "and.b32  t,t,u;           \n\t" // msb: ovfl = (~borrow-in ^ a)&(a^b)
#if __CUDA_ARCH__ >= 200
         "prmt.b32 s,a,0,0xba98;    \n\t" // sign(a) ? 0xff : 0x00
         "xor.b32  s,s,0x7f7f7f7f;  \n\t" // sign(a) ? 0x80 : 0x7f
         "prmt.b32 t,t,0,0xba98;    \n\t" // ovfl ? 0xff : 0x00
         "and.b32  s,s,t;           \n\t" // ovfl ? saturation value : 0
         "not.b32  t,t;             \n\t" // ovfl ? 0x00 : 0xff
         "and.b32  r,r,t;           \n\t" // ovfl ? 0 : difference
         "or.b32   r,r,s;           \n\t" // merge difference and saturation
#else  /* __CUDA_ARCH__ >= 200 */
         "and.b32  t,t,0x80808080;  \n\t" // ovfl ? 0x80 : 0x00
         "shr.u32  s,t,7;           \n\t" // ovfl ? 1 : 0
         "not.b32  u,t;             \n\t" // ovfl ? 0x7f : 0xff (msb cleared)
         "and.b32  r,r,u;           \n\t" // clear msb of overflowed bytes
         "and.b32  u,a,t;           \n\t" // ovfl ? a & 0x80 : 0
         "sub.u32  t,t,s;           \n\t" // ovfl ? 0x7f : 0
         "shr.u32  u,u,7;           \n\t" // ovfl ? sign(a) : 0
         "or.b32   r,r,t;           \n\t" // ovfl ? 0x7f : difference
         "add.u32  r,r,u;           \n\t" // ovfl ? 0x7f + sign(a) : difference
#endif /* __CUDA_ARCH__ >= 200 */
         "mov.b32  %0,r;            \n\t"
         "}"
         : "=r"(r) : "r"(a), "r"(b));
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // byte-wise difference with signed saturation
}
static __device__ __forceinline__ unsigned int vsubus4(unsigned int a, unsigned int b)
{
    unsigned int r;
#if __CUDA_ARCH__ >= 300
    unsigned int c = 0;
    asm ("vsub4.u32.u32.u32.sat %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#else  /* __CUDA_ARCH__ >= 300 */
    // Compute the byte-wise difference, then zero out any byte in which the
    // subtraction borrowed (i.e., where a < b).
    asm ("{                        \n\t"
         ".reg .u32 a,b,r,s,t,u;   \n\t"
         "mov.b32  a,%1;           \n\t"
         "mov.b32  b,%2;           \n\t"
         "not.b32  u,b;            \n\t" // ~b
         "xor.b32  s,u,a;          \n\t" // ~(a ^ b)
         "and.b32  u,u,a;          \n\t" // a & ~b
         "or.b32   r,a,0x80808080; \n\t" // set msbs
         "and.b32  t,b,0x7f7f7f7f; \n\t" // clear msbs
         "sub.u32  r,r,t;          \n\t" // subtract lsbs, msb: ~borrow-in
         "and.b32  t,r,s;          \n\t" // msb: ~borrow-in & ~(a ^ b)
         "and.b32  s,s,0x80808080; \n\t" // msb: ~(a ^ b)
         "xor.b32  r,r,s;          \n\t" // msb: difference bit
         "or.b32   t,t,u;          \n\t" // msb: 1, if no per-byte borrow
#if __CUDA_ARCH__ >= 200
         "prmt.b32 t,t,0,0xba98;   \n\t" // no-borrow ? 0xff : 0x00 per byte
#else  /* __CUDA_ARCH__ >= 200 */
         "and.b32  t,t,0x80808080; \n\t" // isolate msbs
         "shr.u32  s,t,7;          \n\t" // build mask
         "sub.u32  s,t,s;          \n\t" //  from
         "or.b32   t,t,s;          \n\t" //   msbs
#endif /* __CUDA_ARCH__ >= 200 */
         "and.b32  r,r,t;          \n\t" // clamp borrowed bytes to 0
         "mov.b32  %0,r;           \n\t"
         "}"
         : "=r"(r) : "r"(a), "r"(b));
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // byte-wise difference with unsigned saturation
}
static __device__ __forceinline__ unsigned int vneg4(unsigned int a)
{
    return vsub4 (0, a); // byte-wise negation, with wrap-around
}
static __device__ __forceinline__ unsigned int vnegss4(unsigned int a)
{
    unsigned int r, s = 0;
#if __CUDA_ARCH__ >= 300
    asm ("vsub4.s32.s32.s32.sat %0,%1,%2,%3;" : "=r"(r) : "r"(s), "r"(a), "r"(s));
#else  /* __CUDA_ARCH__ >= 300 */
    r = vsub4 (0, a);   // byte-wise negation with wrap-around
    asm ("{                       \n\t"
         ".reg .u32 a, r, s;      \n\t"
         "mov.b32  r,%0;          \n\t"
         "mov.b32  a,%1;          \n\t"
         "and.b32  a,a,0x80808080;\n\t" // extract msbs of a
         "and.b32  s,a,r;         \n\t" // msb set in a and -a: byte was 0x80
         "shr.u32  s,s,7;         \n\t" // wrapped ? 1 : 0
         "sub.u32  r,r,s;         \n\t" // subtract 1 to clamp 0x80 to 0x7f
         "mov.b32  %0,r;          \n\t"
         "}"
         : "+r"(r) : "r"(a));
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // byte-wise negation with signed saturation
}
static __device__ __forceinline__ unsigned int vabsdiffs4(unsigned int a, unsigned int b)
{
    unsigned int r, s;
#if __CUDA_ARCH__ >= 300
    s = 0;
    asm ("vabsdiff4.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(s));
#else  /* __CUDA_ARCH__ >= 300 */
    s = vcmpges4 (a, b);// mask = 0xff if a >= b
    r = a ^ b;
    s = (r & s) ^ b;    // select a where a >= b, else b => max(a,b)
    r = s ^ r;          // select b where a >= b, else a => min(a,b)
    r = vsub4 (s, r);   // |a - b| = max(a,b) - min(a,b)
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // byte-wise absolute difference of signed integers
}
static __device__ __forceinline__ unsigned int vsads4(unsigned int a, unsigned int b)
{
    unsigned int r, s;
#if __CUDA_ARCH__ >= 300
    s = 0;
    asm ("vabsdiff4.s32.s32.s32.add %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(s));
#else  /* __CUDA_ARCH__ >= 300 */
    r = vabsdiffs4 (a, b);
    s = r >> 8;         // align odd bytes with even bytes
    r = (r & 0x00ff00ff) + (s & 0x00ff00ff); // partial sums in each halfword
    r = ((r << 16) + r) >> 16;               // total lands in high halfword
#endif /* __CUDA_ARCH__ >= 300 */
    return r;           // byte-wise sum of abs differences of signed ints
}

#endif /* SIMD_FUNCTIONS_H__ */
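/*
 * Illustrative usage sketch (not part of the original header): adding two
 * arrays of packed 8-bit pixels four at a time with vadd4. The names below
 * (add_pixels, n4) are hypothetical.
 *
 *   __global__ void add_pixels (const unsigned int *a, const unsigned int *b,
 *                               unsigned int *c, int n4)
 *   {
 *       int i = blockIdx.x * blockDim.x + threadIdx.x;
 *       if (i < n4) c[i] = vadd4 (a[i], b[i]); // four byte-wise sums at once
 *   }
 */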