#if !defined (SIMD_FUNCTIONS_H__)
#define SIMD_FUNCTIONS_H__
static __device__ __forceinline__ unsigned int vadd2(unsigned int a, unsigned int b)
{
    unsigned int s = 0, t;
#if __CUDA_ARCH__ >= 300
    asm ("vadd2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(t) : "r"(a), "r"(b), "r"(s));
#endif /* __CUDA_ARCH__ >= 300 */
    return t;   // per-halfword sum; each 16-bit lane wraps independently
}
static __device__ __forceinline__ unsigned int vaddss2 (unsigned int a, unsigned int b)
{
    unsigned int r;
#if __CUDA_ARCH__ >= 300
    unsigned int c = 0;
    asm ("vadd2.s32.s32.s32.sat %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#else  /* __CUDA_ARCH__ >= 300 */
    int ahi, alo, blo, bhi, rhi, rlo;
    ahi = (int)(a & 0xffff0000U);     // isolate high halfwords
    bhi = (int)(b & 0xffff0000U);
#if __CUDA_ARCH__ < 200
    alo = (int)(a << 16);             // move low halfwords into the high bits
    blo = (int)(b << 16);
#elif __CUDA_ARCH__ < 350
    asm ("prmt.b32 %0,%1,0,0x1044;" : "=r"(alo) : "r"(a));
    asm ("prmt.b32 %0,%1,0,0x1044;" : "=r"(blo) : "r"(b));
#else  /* __CUDA_ARCH__ < 350 */
    asm ("shf.l.clamp.b32 %0,0,%1,16;" : "=r"(alo) : "r"(a));
    asm ("shf.l.clamp.b32 %0,0,%1,16;" : "=r"(blo) : "r"(b));
#endif /* __CUDA_ARCH__ < 350 */
    asm ("add.sat.s32 %0,%1,%2;" : "=r"(rlo) : "r"(alo), "r"(blo)); // saturate each half
    asm ("add.sat.s32 %0,%1,%2;" : "=r"(rhi) : "r"(ahi), "r"(bhi));
#if __CUDA_ARCH__ < 200
    r = ((unsigned int)rhi & 0xffff0000U) | ((unsigned int)rlo >> 16);
#else  /* __CUDA_ARCH__ < 200 */
    asm ("prmt.b32 %0,%1,%2,0x7632;" : "=r"(r) : "r"(rlo), "r"(rhi));
#endif /* __CUDA_ARCH__ < 200 */
#endif /* __CUDA_ARCH__ >= 300 */
    return r;
}
static __device__ __forceinline__ unsigned int vaddus2 (unsigned int a, unsigned int b)
{
    unsigned int r;
#if __CUDA_ARCH__ >= 300
    unsigned int c = 0;
    asm ("vadd2.u32.u32.u32.sat %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#else  /* __CUDA_ARCH__ >= 300 */
    int alo, blo, rlo, ahi, bhi, rhi;
    asm ("{                              \n\t"
         "and.b32     %0, %4, 0xffff;    \n\t"   // extract low halfwords
         "and.b32     %1, %5, 0xffff;    \n\t"
#if __CUDA_ARCH__ < 350
         "shr.u32     %2, %4, 16;        \n\t"   // extract high halfwords
         "shr.u32     %3, %5, 16;        \n\t"
#else  /* __CUDA_ARCH__ < 350 */
         "shf.r.clamp.b32  %2, %4, 0, 16;\n\t"
         "shf.r.clamp.b32  %3, %5, 0, 16;\n\t"
#endif /* __CUDA_ARCH__ < 350 */
         "}"
         : "=r"(alo), "=r"(blo), "=r"(ahi), "=r"(bhi)
         : "r"(a), "r"(b));
    rlo = ::min (alo + blo, 65535);   // clamp each halfword sum at 0xffff
    rhi = ::min (ahi + bhi, 65535);
    r = (rhi << 16) + rlo;
#endif /* __CUDA_ARCH__ >= 300 */
    return r;
}
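// Illustrative sketch, not part of the original header: unsigned saturation
// clamps each 16-bit lane at 0xffff, e.g. vaddus2(0xf0000002, 0x20000003) ==
// 0xffff0005. The helper name brighten_u16x2 is hypothetical.
static __device__ __forceinline__ unsigned int brighten_u16x2(unsigned int pix,
                                                              unsigned int gain)
{
    return vaddus2(pix, gain);   // per-lane add that cannot wrap past 0xffff
}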
static __device__ __forceinline__ unsigned int vavgs2(unsigned int a, unsigned int b)
{
    unsigned int r;
#if __CUDA_ARCH__ >= 300
    unsigned int c = 0;
    asm ("vavrg2.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#else  /* __CUDA_ARCH__ >= 300 */
    // avgs(a,b) = ((a+b) < 0) ? ((a+b) >> 1) : ((a+b+1) >> 1), computed per
    // halfword without needing extra intermediate bits
    asm ("{                      \n\t"
         ".reg .u32 a,b,c,r,s,t,u,v;\n\t"
         "mov.b32 a,%1;          \n\t"
         "mov.b32 b,%2;          \n\t"
         "and.b32 u,a,0xfffefffe;\n\t"   // prevent shift crossing halfword boundary
         "and.b32 v,b,0xfffefffe;\n\t"
         "xor.b32 s,a,b;         \n\t"   // a ^ b
         "and.b32 t,a,b;         \n\t"   // a & b
         "shr.u32 u,u,1;         \n\t"   // a >> 1
         "shr.u32 v,v,1;         \n\t"   // b >> 1
         "and.b32 c,s,0x00010001;\n\t"   // (a ^ b) & 1
         "and.b32 s,s,0x80008000;\n\t"   // msbs of a ^ b
         "and.b32 t,t,0x00010001;\n\t"   // (a & b) & 1
         "add.u32 r,u,v;         \n\t"   // (a >> 1) + (b >> 1)
         "add.u32 r,r,t;         \n\t"   // add (a & b & 1)
         "xor.b32 r,r,s;         \n\t"   // msb sum bit
         "shr.u32 t,r,15;        \n\t"   // sign of (a + b) >> 1
         "not.b32 t,t;           \n\t"   // ~sign
         "and.b32 t,t,c;         \n\t"   // rounding term where sum >= 0
         "add.u32 r,r,t;         \n\t"
         "mov.b32 %0,r;          \n\t"
         "}"
         : "=r"(r) : "r"(a), "r"(b));
#endif /* __CUDA_ARCH__ >= 300 */
    return r;
}
static __device__ __forceinline__ unsigned int vavgu2(unsigned int a, unsigned int b)
{
    unsigned int r, c = 0;
#if __CUDA_ARCH__ >= 300
    asm ("vavrg2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#endif /* __CUDA_ARCH__ >= 300 */
    return r;   // per-halfword rounded average: (a + b + 1) >> 1
}
static __device__ __forceinline__ unsigned int vhaddu2(unsigned int a, unsigned int b)
{
    // halving add without rounding, per 16-bit lane:
    // (a + b) / 2 = (a & b) + ((a ^ b) >> 1), with the shifted term masked so
    // a bit cannot cross from the high lane into the low lane
    return (a & b) + (((a ^ b) & 0xfffefffe) >> 1);
}
static __device__ __forceinline__ unsigned int vcmpeq2(unsigned int a, unsigned int b)
{
    unsigned int r, c = 0;
#if __CUDA_ARCH__ >= 300
    asm ("vset2.u32.u32.eq %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
    c = r << 16;         // convert per-halfword bool result
    r = c - r;           //  into a mask (0x0000 or 0xffff)
#endif /* __CUDA_ARCH__ >= 300 */
    return r;
}
static __device__ __forceinline__ unsigned int vcmpgeu2(unsigned int a, unsigned int b)
{
    unsigned int r, c = 0;
#if __CUDA_ARCH__ >= 300
    asm ("vset2.u32.u32.ge %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
    c = r << 16;         // convert bool
    r = c - r;           //  into mask
#else  /* __CUDA_ARCH__ >= 300 */
    asm ("not.b32 %0,%0;" : "+r"(b));
    c = vavgu2 (a, b);   // (a + ~b + 1) / 2; msbs = per-lane carry-outs, i.e. a >= b
    asm ("and.b32 %0,%0,0x80008000;" : "+r"(c));   // extract msbs
    asm ("shr.u32 %0,%1,15;" : "=r"(r) : "r"(c));  // convert
    asm ("sub.u32 %0,%1,%0;" : "+r"(r) : "r"(c));  //  msbs to
    asm ("or.b32 %0,%1,%0;" : "+r"(r) : "r"(c));   //   mask
#endif /* __CUDA_ARCH__ >= 300 */
    return r;
}
static __device__ __forceinline__ unsigned int vcmpgtu2(unsigned int a, unsigned int b)
{
    unsigned int r, c = 0;
#if __CUDA_ARCH__ >= 300
    asm ("vset2.u32.u32.gt %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
    c = r << 16;         // convert bool
    r = c - r;           //  into mask
#else  /* __CUDA_ARCH__ >= 300 */
    asm ("not.b32 %0,%0;" : "+r"(b));
    c = vhaddu2 (a, b);  // (a + ~b) / 2; msbs = per-lane carry-outs, i.e. a > b
    asm ("and.b32 %0,%0,0x80008000;" : "+r"(c));   // extract msbs
    asm ("shr.u32 %0,%1,15;" : "=r"(r) : "r"(c));  // convert
    asm ("sub.u32 %0,%1,%0;" : "+r"(r) : "r"(c));  //  msbs to
    asm ("or.b32 %0,%1,%0;" : "+r"(r) : "r"(c));   //   mask
#endif /* __CUDA_ARCH__ >= 300 */
    return r;
}
static __device__ __forceinline__ unsigned int vcmpleu2(unsigned int a, unsigned int b)
{
    unsigned int r, c = 0;
#if __CUDA_ARCH__ >= 300
    asm ("vset2.u32.u32.le %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
    c = r << 16;         // convert bool
    r = c - r;           //  into mask
#else  /* __CUDA_ARCH__ >= 300 */
    asm ("not.b32 %0,%0;" : "+r"(a));
    c = vavgu2 (a, b);   // (b + ~a + 1) / 2; msbs set where b >= a, i.e. a <= b
    asm ("and.b32 %0,%0,0x80008000;" : "+r"(c));   // extract msbs
    asm ("shr.u32 %0,%1,15;" : "=r"(r) : "r"(c));  // convert
    asm ("sub.u32 %0,%1,%0;" : "+r"(r) : "r"(c));  //  msbs to
    asm ("or.b32 %0,%1,%0;" : "+r"(r) : "r"(c));   //   mask
#endif /* __CUDA_ARCH__ >= 300 */
    return r;
}
static __device__ __forceinline__ unsigned int vcmpltu2(unsigned int a, unsigned int b)
{
    unsigned int r, c = 0;
#if __CUDA_ARCH__ >= 300
    asm ("vset2.u32.u32.lt %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
    c = r << 16;         // convert bool
    r = c - r;           //  into mask
#else  /* __CUDA_ARCH__ >= 300 */
    asm ("not.b32 %0,%0;" : "+r"(a));
    c = vhaddu2 (a, b);  // (b + ~a) / 2; msbs set where b > a, i.e. a < b
    asm ("and.b32 %0,%0,0x80008000;" : "+r"(c));   // extract msbs
    asm ("shr.u32 %0,%1,15;" : "=r"(r) : "r"(c));  // convert
    asm ("sub.u32 %0,%1,%0;" : "+r"(r) : "r"(c));  //  msbs to
    asm ("or.b32 %0,%1,%0;" : "+r"(r) : "r"(c));   //   mask
#endif /* __CUDA_ARCH__ >= 300 */
    return r;
}
static __device__ __forceinline__ unsigned int vcmpne2(unsigned int a, unsigned int b)
{
    unsigned int r, c = 0;
#if __CUDA_ARCH__ >= 300
    asm ("vset2.u32.u32.ne %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
    c = r << 16;         // convert bool
    r = c - r;           //  into mask
#else  /* __CUDA_ARCH__ >= 300 */
    // inspired by Alan Mycroft's null-byte detection algorithm
    r = a ^ b;           // 0x0000 iff the halfwords are equal
    c = r | 0x80008000;  // set msbs to catch a carry-out
    c = c - 0x00010001;  // msb clear iff halfword was 0x0000 or 0x8000
    c = r | c;           // msb set iff halfword was non-zero
    asm ("and.b32 %0,%0,0x80008000;" : "+r"(c));   // extract msbs
    asm ("shr.u32 %0,%1,15;" : "=r"(r) : "r"(c));  // convert
    asm ("sub.u32 %0,%1,%0;" : "+r"(r) : "r"(c));  //  msbs to
    asm ("or.b32 %0,%1,%0;" : "+r"(r) : "r"(c));   //   mask
#endif /* __CUDA_ARCH__ >= 300 */
    return r;
}
static __device__ __forceinline__ unsigned int vabsdiffu2(unsigned int a, unsigned int b)
{
    unsigned int r, s = 0;
#if __CUDA_ARCH__ >= 300
    asm ("vabsdiff2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(s));
#else  /* __CUDA_ARCH__ >= 300 */
    unsigned int t, u, v;
    u = ::max (a & 0x0000ffff, b & 0x0000ffff);   // max/min of low halfwords
    v = ::min (a & 0x0000ffff, b & 0x0000ffff);
    t = ::max (a & 0xffff0000U, b & 0xffff0000U); // max/min of high halfwords
    s = ::min (a & 0xffff0000U, b & 0xffff0000U);
    r = (u - v) + (t - s);   // per halfword: |a - b| = max(a,b) - min(a,b)
#endif /* __CUDA_ARCH__ >= 300 */
    return r;
}
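// Illustrative sketch, not part of the original header: per-lane absolute
// differences fold naturally into a sum of absolute differences (SAD) for a
// pair of packed 16-bit samples. The helper name sad_u16x2 is hypothetical.
static __device__ __forceinline__ unsigned int sad_u16x2(unsigned int a, unsigned int b)
{
    unsigned int d = vabsdiffu2(a, b);   // |a - b| in each 16-bit lane
    return (d >> 16) + (d & 0xffff);     // add the two lane results
}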
static __device__ __forceinline__ unsigned int vmaxu2(unsigned int a, unsigned int b)
{
    unsigned int r, s = 0;
#if __CUDA_ARCH__ >= 300
    asm ("vmax2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(s));
#endif /* __CUDA_ARCH__ >= 300 */
    return r;   // per-halfword unsigned maximum
}
static __device__ __forceinline__ unsigned int vminu2(unsigned int a, unsigned int b)
{
    unsigned int r, s = 0;
#if __CUDA_ARCH__ >= 300
    asm ("vmin2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(s));
#endif /* __CUDA_ARCH__ >= 300 */
    return r;   // per-halfword unsigned minimum
}
static __device__ __forceinline__ unsigned int vseteq2(unsigned int a, unsigned int b)
{
    unsigned int r, c = 0;
#if __CUDA_ARCH__ >= 300
    asm ("vset2.u32.u32.eq %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#endif /* __CUDA_ARCH__ >= 300 */
    return r;   // per-halfword bool: 1 where a == b, else 0
}
static __device__ __forceinline__ unsigned int vsetgeu2(unsigned int a, unsigned int b)
{
    unsigned int r;
#if __CUDA_ARCH__ >= 300
    unsigned int c = 0;
    asm ("vset2.u32.u32.ge %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#else  /* __CUDA_ARCH__ >= 300 */
    asm ("not.b32 %0,%0;" : "+r"(b));
    r = vavgu2 (a, b);            // (a + ~b + 1) / 2; msbs = per-lane carry-outs
    r = (r & 0x80008000) >> 15;   // convert msbs to per-halfword bool
#endif /* __CUDA_ARCH__ >= 300 */
    return r;
}
static __device__ __forceinline__ unsigned int vsetgtu2(unsigned int a, unsigned int b)
{
    unsigned int r;
#if __CUDA_ARCH__ >= 300
    unsigned int c = 0;
    asm ("vset2.u32.u32.gt %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#else  /* __CUDA_ARCH__ >= 300 */
    asm ("not.b32 %0,%0;" : "+r"(b));
    r = vhaddu2 (a, b);           // (a + ~b) / 2; msbs set where a > b
    r = (r & 0x80008000) >> 15;   // convert msbs to per-halfword bool
#endif /* __CUDA_ARCH__ >= 300 */
    return r;
}
static __device__ __forceinline__ unsigned int vsetleu2(unsigned int a, unsigned int b)
{
    unsigned int r;
#if __CUDA_ARCH__ >= 300
    unsigned int c = 0;
    asm ("vset2.u32.u32.le %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#else  /* __CUDA_ARCH__ >= 300 */
    asm ("not.b32 %0,%0;" : "+r"(a));
    r = vavgu2 (a, b);            // (b + ~a + 1) / 2; msbs set where a <= b
    r = (r & 0x80008000) >> 15;   // convert msbs to per-halfword bool
#endif /* __CUDA_ARCH__ >= 300 */
    return r;
}
static __device__ __forceinline__ unsigned int vsetltu2(unsigned int a, unsigned int b)
{
    unsigned int r;
#if __CUDA_ARCH__ >= 300
    unsigned int c = 0;
    asm ("vset2.u32.u32.lt %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#else  /* __CUDA_ARCH__ >= 300 */
    asm ("not.b32 %0,%0;" : "+r"(a));
    r = vhaddu2 (a, b);           // (b + ~a) / 2; msbs set where a < b
    r = (r & 0x80008000) >> 15;   // convert msbs to per-halfword bool
#endif /* __CUDA_ARCH__ >= 300 */
    return r;
}
static __device__ __forceinline__ unsigned int vsetne2(unsigned int a, unsigned int b)
{
    unsigned int r, c = 0;
#if __CUDA_ARCH__ >= 300
    asm ("vset2.u32.u32.ne %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#endif /* __CUDA_ARCH__ >= 300 */
    return r;   // per-halfword bool: 1 where a != b, else 0
}
static __device__ __forceinline__ unsigned int vsub2(unsigned int a, unsigned int b)
{
    unsigned int s = 0, t;
#if __CUDA_ARCH__ >= 300
    asm ("vsub2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(t) : "r"(a), "r"(b), "r"(s));
#endif /* __CUDA_ARCH__ >= 300 */
    return t;   // per-halfword difference; each 16-bit lane wraps independently
}
static __device__ __forceinline__ unsigned int vadd4(unsigned int a, unsigned int b)
{
#if __CUDA_ARCH__ >= 300
    unsigned int r, c = 0;
    asm ("vadd4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#else  /* __CUDA_ARCH__ >= 300 */
    unsigned int r, s, t;
    s = a ^ b;            // per-bit sum without carries
    r = a & 0x7f7f7f7f;   // clear msb of each byte
    t = b & 0x7f7f7f7f;
    s = s & 0x80808080;   // msb sum bits
    r = r + t;            // add low 7 bits of each byte; carry-ins land in msbs
    r = r ^ s;            // fold msb sum bits back in, discarding per-byte carry-out
#endif /* __CUDA_ARCH__ >= 300 */
    return r;
}
static __device__ __forceinline__ unsigned int vaddus4 (unsigned int a, unsigned int b)
{
#if __CUDA_ARCH__ >= 300
    unsigned int r, c = 0;
    asm ("vadd4.u32.u32.u32.sat %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#else  /* __CUDA_ARCH__ >= 300 */
    // per-byte add; bytes whose sum carries out are saturated to 0xff
    unsigned int r;
    asm ("{                          \n\t"
         ".reg .u32 a,b,r,s,t,m;     \n\t"
         "mov.b32 a, %1;             \n\t"
         "mov.b32 b, %2;             \n\t"
         "or.b32  m, a, b;           \n\t"
         "and.b32 r, a, 0x7f7f7f7f;  \n\t"   // clear msb of each byte
         "and.b32 t, b, 0x7f7f7f7f;  \n\t"
         "and.b32 m, m, 0x80808080;  \n\t"   // msbs of a | b
         "add.u32 r, r, t;           \n\t"   // add low 7 bits of each byte
         "and.b32 t, a, b;           \n\t"
         "or.b32  t, t, r;           \n\t"
         "and.b32 t, t, m;           \n\t"   // msb = per-byte carry-out
         "shr.u32 s, t, 7;           \n\t"
         "sub.u32 t, t, s;           \n\t"   // 0x7f where the byte saturates
         "or.b32  t, t, m;           \n\t"   // 0xff where saturating, msbs elsewhere
         "or.b32  r, r, t;           \n\t"   // set result msbs / saturate to 0xff
         "mov.b32 %0, r;             \n\t"
         "}"
         : "=r"(r) : "r"(a), "r"(b));
#endif /* __CUDA_ARCH__ >= 300 */
    return r;
}
static __device__ __forceinline__ unsigned int vaddss4 (unsigned int a, unsigned int b)
{
#if __CUDA_ARCH__ >= 300
    unsigned int r, c = 0;
    asm ("vadd4.sat.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#else  /* __CUDA_ARCH__ >= 300 */
    // per-byte wrapped add, then clamp overflowed bytes to 0x7f (positive
    // overflow) or 0x80 (negative overflow)
    unsigned int r;
    asm ("{                          \n\t"
         ".reg .u32 a,b,r,s,t,u;     \n\t"
         "mov.b32 a, %1;             \n\t"
         "mov.b32 b, %2;             \n\t"
         "xor.b32 s, a, b;           \n\t"   // sign-difference bits
         "and.b32 r, a, 0x7f7f7f7f;  \n\t"   // clear msb of each byte
         "and.b32 t, b, 0x7f7f7f7f;  \n\t"
         "and.b32 s, s, 0x80808080;  \n\t"
         "add.u32 r, r, t;           \n\t"   // add low 7 bits; carry-ins land in msbs
         "xor.b32 t, a, r;           \n\t"   // msb: sign of a ^ carry into msb
         "xor.b32 r, r, s;           \n\t"   // complete the wrapped per-byte sum
         "not.b32 s, s;              \n\t"
         "and.b32 t, t, s;           \n\t"   // overflow iff signs equal and
         "and.b32 t, t, 0x80808080;  \n\t"   //  carry-in differs from sign of a
         "shr.u32 s, t, 7;           \n\t"
         "not.b32 u, t;              \n\t"
         "and.b32 r, r, u;           \n\t"   // clear msb of overflowed bytes
         "and.b32 u, a, t;           \n\t"   // sign of a for overflowed bytes
         "sub.u32 t, t, s;           \n\t"   // 0x7f mask for overflowed bytes
         "shr.u32 u, u, 7;           \n\t"
         "or.b32  r, r, t;           \n\t"   // positive overflow saturates to 0x7f
         "add.u32 r, r, u;           \n\t"   // negative overflow becomes 0x80
         "mov.b32 %0, r;             \n\t"
         "}"
         : "=r"(r) : "r"(a), "r"(b));
#endif /* __CUDA_ARCH__ >= 300 */
    return r;
}
static __device__ __forceinline__ unsigned int vavgs4(unsigned int a, unsigned int b)
{
    unsigned int r;
#if __CUDA_ARCH__ >= 300
    unsigned int c = 0;
    asm ("vavrg4.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#else  /* __CUDA_ARCH__ >= 300 */
    // avgs(a,b) = ((a+b) < 0) ? ((a+b) >> 1) : ((a+b+1) >> 1), computed per byte
    asm ("{                      \n\t"
         ".reg .u32 a,b,c,r,s,t,u,v;\n\t"
         "mov.b32 a,%1;          \n\t"
         "mov.b32 b,%2;          \n\t"
         "and.b32 u,a,0xfefefefe;\n\t"   // prevent shift crossing byte boundary
         "and.b32 v,b,0xfefefefe;\n\t"
         "xor.b32 s,a,b;         \n\t"   // a ^ b
         "and.b32 t,a,b;         \n\t"   // a & b
         "shr.u32 u,u,1;         \n\t"   // a >> 1
         "shr.u32 v,v,1;         \n\t"   // b >> 1
         "and.b32 c,s,0x01010101;\n\t"   // (a ^ b) & 1
         "and.b32 s,s,0x80808080;\n\t"   // msbs of a ^ b
         "and.b32 t,t,0x01010101;\n\t"   // (a & b) & 1
         "add.u32 r,u,v;         \n\t"   // (a >> 1) + (b >> 1)
         "add.u32 r,r,t;         \n\t"   // add (a & b & 1)
         "xor.b32 r,r,s;         \n\t"   // msb sum bit
         "shr.u32 t,r,7;         \n\t"   // sign of (a + b) >> 1
         "not.b32 t,t;           \n\t"   // ~sign
         "and.b32 t,t,c;         \n\t"   // rounding term where sum >= 0
         "add.u32 r,r,t;         \n\t"
         "mov.b32 %0,r;          \n\t"
         "}"
         : "=r"(r) : "r"(a), "r"(b));
#endif /* __CUDA_ARCH__ >= 300 */
    return r;
}
static __device__ __forceinline__ unsigned int vavgu4(unsigned int a, unsigned int b)
{
    unsigned int r, c = 0;
#if __CUDA_ARCH__ >= 300
    asm ("vavrg4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#endif /* __CUDA_ARCH__ >= 300 */
    return r;   // per-byte rounded average: (a + b + 1) >> 1
}
static __device__ __forceinline__ unsigned int vhaddu4(unsigned int a, unsigned int b)
{
    // halving add without rounding, per byte:
    // (a + b) / 2 = (a & b) + ((a ^ b) >> 1), with the shifted term masked so
    // a bit cannot cross a byte boundary
    return (a & b) + (((a ^ b) & 0xfefefefe) >> 1);
}
static __device__ __forceinline__ unsigned int vcmpeq4(unsigned int a, unsigned int b)
{
    unsigned int c, r = 0;
#if __CUDA_ARCH__ >= 300
    asm ("vset4.u32.u32.eq %0,%1,%2,%3;" : "=r"(c) : "r"(a), "r"(b), "r"(r));
    r = c << 8;          // convert per-byte bool result
    r = r - c;           //  into a per-byte mask (0x00 or 0xff)
#endif /* __CUDA_ARCH__ >= 300 */
    return r;
}
static __device__ __forceinline__ unsigned int vcmpgeu4(unsigned int a, unsigned int b)
{
    unsigned int r, c = 0;
#if __CUDA_ARCH__ >= 300
    asm ("vset4.u32.u32.ge %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
    c = r << 8;          // convert bool
    r = c - r;           //  into mask
#else  /* __CUDA_ARCH__ >= 300 */
    asm ("not.b32 %0,%0;" : "+r"(b));
    c = vavgu4 (a, b);   // (a + ~b + 1) / 2; msbs = per-byte carry-outs, i.e. a >= b
    asm ("and.b32 %0,%0,0x80808080;" : "+r"(c));  // extract msbs
    asm ("shr.u32 %0,%1,7;" : "=r"(r) : "r"(c));  // convert
    asm ("sub.u32 %0,%1,%0;" : "+r"(r) : "r"(c)); //  msbs to
    asm ("or.b32 %0,%1,%0;" : "+r"(r) : "r"(c));  //   mask
#endif /* __CUDA_ARCH__ >= 300 */
    return r;
}
static __device__ __forceinline__ unsigned int vcmpgtu4(unsigned int a, unsigned int b)
{
    unsigned int r, c = 0;
#if __CUDA_ARCH__ >= 300
    asm ("vset4.u32.u32.gt %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
    c = r << 8;          // convert bool
    r = c - r;           //  into mask
#else  /* __CUDA_ARCH__ >= 300 */
    asm ("not.b32 %0,%0;" : "+r"(b));
    c = vhaddu4 (a, b);  // (a + ~b) / 2; msbs = per-byte carry-outs, i.e. a > b
    asm ("and.b32 %0,%0,0x80808080;" : "+r"(c));  // extract msbs
    asm ("shr.u32 %0,%1,7;" : "=r"(r) : "r"(c));  // convert
    asm ("sub.u32 %0,%1,%0;" : "+r"(r) : "r"(c)); //  msbs to
    asm ("or.b32 %0,%1,%0;" : "+r"(r) : "r"(c));  //   mask
#endif /* __CUDA_ARCH__ >= 300 */
    return r;
}
static __device__ __forceinline__ unsigned int vcmpleu4(unsigned int a, unsigned int b)
{
    unsigned int r, c = 0;
#if __CUDA_ARCH__ >= 300
    asm ("vset4.u32.u32.le %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
    c = r << 8;          // convert bool
    r = c - r;           //  into mask
#else  /* __CUDA_ARCH__ >= 300 */
    asm ("not.b32 %0,%0;" : "+r"(a));
    c = vavgu4 (a, b);   // (b + ~a + 1) / 2; msbs set where b >= a, i.e. a <= b
    asm ("and.b32 %0,%0,0x80808080;" : "+r"(c));  // extract msbs
    asm ("shr.u32 %0,%1,7;" : "=r"(r) : "r"(c));  // convert
    asm ("sub.u32 %0,%1,%0;" : "+r"(r) : "r"(c)); //  msbs to
    asm ("or.b32 %0,%1,%0;" : "+r"(r) : "r"(c));  //   mask
#endif /* __CUDA_ARCH__ >= 300 */
    return r;
}
static __device__ __forceinline__ unsigned int vcmpltu4(unsigned int a, unsigned int b)
{
    unsigned int r, c = 0;
#if __CUDA_ARCH__ >= 300
    asm ("vset4.u32.u32.lt %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
    c = r << 8;          // convert bool
    r = c - r;           //  into mask
#else  /* __CUDA_ARCH__ >= 300 */
    asm ("not.b32 %0,%0;" : "+r"(a));
    c = vhaddu4 (a, b);  // (b + ~a) / 2; msbs set where b > a, i.e. a < b
    asm ("and.b32 %0,%0,0x80808080;" : "+r"(c));  // extract msbs
    asm ("shr.u32 %0,%1,7;" : "=r"(r) : "r"(c));  // convert
    asm ("sub.u32 %0,%1,%0;" : "+r"(r) : "r"(c)); //  msbs to
    asm ("or.b32 %0,%1,%0;" : "+r"(r) : "r"(c));  //   mask
#endif /* __CUDA_ARCH__ >= 300 */
    return r;
}
static __device__ __forceinline__ unsigned int vcmpne4(unsigned int a, unsigned int b)
{
    unsigned int r, c = 0;
#if __CUDA_ARCH__ >= 300
    asm ("vset4.u32.u32.ne %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
    c = r << 8;          // convert bool
    r = c - r;           //  into mask
#else  /* __CUDA_ARCH__ >= 300 */
    // inspired by Alan Mycroft's null-byte detection algorithm
    r = a ^ b;           // 0x00 iff the bytes are equal
    c = r | 0x80808080;  // set msbs to catch a carry-out
    c = c - 0x01010101;  // msb clear iff byte was 0x00 or 0x80
    c = r | c;           // msb set iff byte was non-zero
    asm ("and.b32 %0,%0,0x80808080;" : "+r"(c));  // extract msbs
    asm ("shr.u32 %0,%1,7;" : "=r"(r) : "r"(c));  // convert
    asm ("sub.u32 %0,%1,%0;" : "+r"(r) : "r"(c)); //  msbs to
    asm ("or.b32 %0,%1,%0;" : "+r"(r) : "r"(c));  //   mask
#endif /* __CUDA_ARCH__ >= 300 */
    return r;
}
static __device__ __forceinline__ unsigned int vabsdiffu4(unsigned int a, unsigned int b)
{
    unsigned int r, s = 0;
#if __CUDA_ARCH__ >= 300
    asm ("vabsdiff4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(s));
#endif /* __CUDA_ARCH__ >= 300 */
    return r;   // per-byte |a - b|
}
static __device__ __forceinline__ unsigned int vmaxu4(unsigned int a, unsigned int b)
{
    unsigned int r, s = 0;
#if __CUDA_ARCH__ >= 300
    asm ("vmax4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(s));
#endif /* __CUDA_ARCH__ >= 300 */
    return r;   // per-byte unsigned maximum
}
static __device__ __forceinline__ unsigned int vminu4(unsigned int a, unsigned int b)
{
    unsigned int r, s = 0;
#if __CUDA_ARCH__ >= 300
    asm ("vmin4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(s));
#endif /* __CUDA_ARCH__ >= 300 */
    return r;   // per-byte unsigned minimum
}
static __device__ __forceinline__ unsigned int vseteq4(unsigned int a, unsigned int b)
{
    unsigned int r, c = 0;
#if __CUDA_ARCH__ >= 300
    asm ("vset4.u32.u32.eq %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#endif /* __CUDA_ARCH__ >= 300 */
    return r;   // per-byte bool: 1 where a == b, else 0
}
static __device__ __forceinline__ unsigned int vsetleu4(unsigned int a, unsigned int b)
{
    unsigned int r;
#if __CUDA_ARCH__ >= 300
    unsigned int c = 0;
    asm ("vset4.u32.u32.le %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#else  /* __CUDA_ARCH__ >= 300 */
    asm ("not.b32 %0,%0;" : "+r"(a));
    r = vavgu4 (a, b);           // (b + ~a + 1) / 2; msbs set where a <= b
    r = (r & 0x80808080) >> 7;   // convert msbs to per-byte bool
#endif /* __CUDA_ARCH__ >= 300 */
    return r;
}
static __device__ __forceinline__ unsigned int vsetltu4(unsigned int a, unsigned int b)
{
    unsigned int r;
#if __CUDA_ARCH__ >= 300
    unsigned int c = 0;
    asm ("vset4.u32.u32.lt %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#else  /* __CUDA_ARCH__ >= 300 */
    asm ("not.b32 %0,%0;" : "+r"(a));
    r = vhaddu4 (a, b);          // (b + ~a) / 2; msbs set where a < b
    r = (r & 0x80808080) >> 7;   // convert msbs to per-byte bool
#endif /* __CUDA_ARCH__ >= 300 */
    return r;
}
static __device__ __forceinline__ unsigned int vsetgeu4(unsigned int a, unsigned int b)
{
    unsigned int r;
#if __CUDA_ARCH__ >= 300
    unsigned int c = 0;
    asm ("vset4.u32.u32.ge %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#else  /* __CUDA_ARCH__ >= 300 */
    asm ("not.b32 %0,%0;" : "+r"(b));
    r = vavgu4 (a, b);           // (a + ~b + 1) / 2; msbs set where a >= b
    r = (r & 0x80808080) >> 7;   // convert msbs to per-byte bool
#endif /* __CUDA_ARCH__ >= 300 */
    return r;
}
static __device__ __forceinline__ unsigned int vsetgtu4(unsigned int a, unsigned int b)
{
    unsigned int r;
#if __CUDA_ARCH__ >= 300
    unsigned int c = 0;
    asm ("vset4.u32.u32.gt %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#else  /* __CUDA_ARCH__ >= 300 */
    asm ("not.b32 %0,%0;" : "+r"(b));
    r = vhaddu4 (a, b);          // (a + ~b) / 2; msbs set where a > b
    r = (r & 0x80808080) >> 7;   // convert msbs to per-byte bool
#endif /* __CUDA_ARCH__ >= 300 */
    return r;
}
static __device__ __forceinline__ unsigned int vsetne4(unsigned int a, unsigned int b)
{
    unsigned int r, c = 0;
#if __CUDA_ARCH__ >= 300
    asm ("vset4.u32.u32.ne %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#endif /* __CUDA_ARCH__ >= 300 */
    return r;   // per-byte bool: 1 where a != b, else 0
}
static __device__ __forceinline__ unsigned int vsub4(unsigned int a, unsigned int b)
{
#if __CUDA_ARCH__ >= 300
    unsigned int r, c = 0;
    asm ("vsub4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#else  /* __CUDA_ARCH__ >= 300 */
    unsigned int r, s, t;
    s = a ^ ~b;           // inverted per-bit sum, without borrows
    r = a | 0x80808080;   // set msbs so a borrow cannot leave its byte
    t = b & 0x7f7f7f7f;   // clear msbs
    s = s & 0x80808080;   // inverted msb sum bits
    r = r - t;            // subtract low 7 bits; inverted borrows land in msbs
    r = r ^ s;            // combine msb sum bits and borrows
#endif /* __CUDA_ARCH__ >= 300 */
    return r;
}
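// Illustrative sketch, not part of the original header: a per-byte equality
// mask makes it easy to test four packed bytes against a key at once, e.g.
// for a strchr-style scan. The helper name contains_byte is hypothetical.
static __device__ __forceinline__ int contains_byte(unsigned int word, unsigned char key)
{
    unsigned int k = key * 0x01010101U;   // broadcast key to all four bytes
    return vcmpeq4(word, k) != 0;         // any byte of word equal to key?
}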