10 #ifndef EIGEN_PACKET_MATH_GPU_H
11 #define EIGEN_PACKET_MATH_GPU_H
14 #include "../../InternalHeaderCheck.h"
21 #if defined(EIGEN_HIP_DEVICE_COMPILE) || (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350)
22 #define EIGEN_GPU_HAS_LDG 1
26 #if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530)
27 #define EIGEN_CUDA_HAS_FP16_ARITHMETIC 1
30 #if defined(EIGEN_HIP_DEVICE_COMPILE) || defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
31 #define EIGEN_GPU_HAS_FP16_ARITHMETIC 1
37 #if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
40 struct is_arithmetic<float4> {
41 enum {
value =
true };
44 struct is_arithmetic<double2> {
45 enum {
value =
true };
49 struct packet_traits<float> : default_packet_traits {
82 struct packet_traits<
double> : default_packet_traits {
113 struct unpacket_traits<float4> {
125 struct unpacket_traits<double2> {
134 typedef double2
half;
139 return make_float4(from, from, from, from);
143 return make_double2(from, from);
149 #if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)
152 return __int_as_float(__float_as_int(
a) & __float_as_int(
b));
155 return __longlong_as_double(__double_as_longlong(
a) & __double_as_longlong(
b));
159 return __int_as_float(__float_as_int(
a) | __float_as_int(
b));
162 return __longlong_as_double(__double_as_longlong(
a) | __double_as_longlong(
b));
166 return __int_as_float(__float_as_int(
a) ^ __float_as_int(
b));
169 return __longlong_as_double(__double_as_longlong(
a) ^ __double_as_longlong(
b));
173 return __int_as_float(__float_as_int(
a) & ~__float_as_int(
b));
176 return __longlong_as_double(__double_as_longlong(
a) & ~__double_as_longlong(
b));
179 return __int_as_float(
a ==
b ? 0xffffffffu : 0u);
182 return __longlong_as_double(
a ==
b ? 0xffffffffffffffffull : 0ull);
186 return __int_as_float(
a <
b ? 0xffffffffu : 0u);
190 return __longlong_as_double(
a <
b ? 0xffffffffffffffffull : 0ull);
194 return __int_as_float(
a <=
b ? 0xffffffffu : 0u);
198 return __longlong_as_double(
a <=
b ? 0xffffffffffffffffull : 0ull);
203 return make_float4(bitwise_and(
a.x,
b.x), bitwise_and(
a.y,
b.y), bitwise_and(
a.z,
b.z), bitwise_and(
a.w,
b.w));
207 return make_double2(bitwise_and(
a.x,
b.x), bitwise_and(
a.y,
b.y));
212 return make_float4(bitwise_or(
a.x,
b.x), bitwise_or(
a.y,
b.y), bitwise_or(
a.z,
b.z), bitwise_or(
a.w,
b.w));
216 return make_double2(bitwise_or(
a.x,
b.x), bitwise_or(
a.y,
b.y));
221 return make_float4(bitwise_xor(
a.x,
b.x), bitwise_xor(
a.y,
b.y), bitwise_xor(
a.z,
b.z), bitwise_xor(
a.w,
b.w));
225 return make_double2(bitwise_xor(
a.x,
b.x), bitwise_xor(
a.y,
b.y));
230 return make_float4(bitwise_andnot(
a.x,
b.x), bitwise_andnot(
a.y,
b.y), bitwise_andnot(
a.z,
b.z),
231 bitwise_andnot(
a.w,
b.w));
235 return make_double2(bitwise_andnot(
a.x,
b.x), bitwise_andnot(
a.y,
b.y));
240 return make_float4(eq_mask(
a.x,
b.x), eq_mask(
a.y,
b.y), eq_mask(
a.z,
b.z), eq_mask(
a.w,
b.w));
244 return make_float4(lt_mask(
a.x,
b.x), lt_mask(
a.y,
b.y), lt_mask(
a.z,
b.z), lt_mask(
a.w,
b.w));
248 return make_float4(le_mask(
a.x,
b.x), le_mask(
a.y,
b.y), le_mask(
a.z,
b.z), le_mask(
a.w,
b.w));
252 return make_double2(eq_mask(
a.x,
b.x), eq_mask(
a.y,
b.y));
256 return make_double2(lt_mask(
a.x,
b.x), lt_mask(
a.y,
b.y));
260 return make_double2(le_mask(
a.x,
b.x), le_mask(
a.y,
b.y));
267 return make_float4(
a,
a + 1,
a + 2,
a + 3);
271 return make_double2(
a,
a + 1);
276 return make_float4(
a.x +
b.x,
a.y +
b.y,
a.z +
b.z,
a.w +
b.w);
280 return make_double2(
a.x +
b.x,
a.y +
b.y);
285 return make_float4(
a.x -
b.x,
a.y -
b.y,
a.z -
b.z,
a.w -
b.w);
289 return make_double2(
a.x -
b.x,
a.y -
b.y);
294 return make_float4(-
a.x, -
a.y, -
a.z, -
a.w);
298 return make_double2(-
a.x, -
a.y);
312 return make_float4(
a.x *
b.x,
a.y *
b.y,
a.z *
b.z,
a.w *
b.w);
316 return make_double2(
a.x *
b.x,
a.y *
b.y);
321 return make_float4(
a.x /
b.x,
a.y /
b.y,
a.z /
b.z,
a.w /
b.w);
325 return make_double2(
a.x /
b.x,
a.y /
b.y);
330 return make_float4(fminf(
a.x,
b.x), fminf(
a.y,
b.y), fminf(
a.z,
b.z), fminf(
a.w,
b.w));
339 return make_float4(fmaxf(
a.x,
b.x), fmaxf(
a.y,
b.y), fmaxf(
a.z,
b.z), fmaxf(
a.w,
b.w));
348 return *
reinterpret_cast<const float4*
>(from);
353 return *
reinterpret_cast<const double2*
>(from);
358 return make_float4(from[0], from[1], from[2], from[3]);
362 return make_double2(from[0], from[1]);
367 return make_float4(from[0], from[0], from[1], from[1]);
371 return make_double2(from[0], from[0]);
376 *
reinterpret_cast<float4*
>(to) = from;
381 *
reinterpret_cast<double2*
>(to) = from;
400 #if defined(EIGEN_GPU_HAS_LDG)
401 return __ldg(
reinterpret_cast<const float4*
>(from));
403 return make_float4(from[0], from[1], from[2], from[3]);
408 #if defined(EIGEN_GPU_HAS_LDG)
409 return __ldg(
reinterpret_cast<const double2*
>(from));
411 return make_double2(from[0], from[1]);
417 #if defined(EIGEN_GPU_HAS_LDG)
418 return make_float4(__ldg(from + 0), __ldg(from + 1), __ldg(from + 2), __ldg(from + 3));
420 return make_float4(from[0], from[1], from[2], from[3]);
425 #if defined(EIGEN_GPU_HAS_LDG)
426 return make_double2(__ldg(from + 0), __ldg(from + 1));
428 return make_double2(from[0], from[1]);
434 return make_float4(from[0 * stride], from[1 * stride], from[2 * stride], from[3 * stride]);
439 return make_double2(from[0 * stride], from[1 * stride]);
444 to[stride * 0] = from.x;
445 to[stride * 1] = from.y;
446 to[stride * 2] = from.z;
447 to[stride * 3] = from.w;
451 to[stride * 0] = from.x;
452 to[stride * 1] = from.y;
466 return a.x +
a.y +
a.z +
a.w;
475 return fmaxf(fmaxf(
a.x,
a.y), fmaxf(
a.z,
a.w));
484 return fminf(fminf(
a.x,
a.y), fminf(
a.z,
a.w));
493 return a.x *
a.y *
a.z *
a.w;
502 return make_float4(fabsf(
a.x), fabsf(
a.y), fabsf(
a.z), fabsf(
a.w));
511 return make_float4(floorf(
a.x), floorf(
a.y), floorf(
a.z), floorf(
a.w));
520 return make_float4(ceilf(
a.x), ceilf(
a.y), ceilf(
a.z), ceilf(
a.w));
529 return make_float4(rintf(
a.x), rintf(
a.y), rintf(
a.z), rintf(
a.w));
538 return make_float4(truncf(
a.x), truncf(
a.y), truncf(
a.z), truncf(
a.w));
547 kernel.packet[0].y = kernel.packet[1].x;
548 kernel.packet[1].x =
tmp;
551 kernel.packet[0].z = kernel.packet[2].x;
552 kernel.packet[2].x =
tmp;
555 kernel.packet[0].w = kernel.packet[3].x;
556 kernel.packet[3].x =
tmp;
559 kernel.packet[1].z = kernel.packet[2].y;
560 kernel.packet[2].y =
tmp;
563 kernel.packet[1].w = kernel.packet[3].y;
564 kernel.packet[3].y =
tmp;
567 kernel.packet[2].w = kernel.packet[3].z;
568 kernel.packet[3].z =
tmp;
573 kernel.packet[0].y = kernel.packet[1].x;
574 kernel.packet[1].x =
tmp;
582 #if (defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)) && defined(EIGEN_GPU_COMPILE_PHASE)
584 typedef ulonglong2 Packet4h2;
586 struct unpacket_traits<Packet4h2> {
595 typedef Packet4h2
half;
598 struct is_arithmetic<Packet4h2> {
599 enum {
value =
true };
603 struct unpacket_traits<half2> {
615 struct is_arithmetic<half2> {
616 enum {
value =
true };
620 struct packet_traits<
Eigen::half> : default_packet_traits {
621 typedef Packet4h2
type;
622 typedef Packet4h2
half;
642 return __half2half2(from);
648 half2* p_alias =
reinterpret_cast<half2*
>(&
r);
649 p_alias[0] = pset1<half2>(from);
650 p_alias[1] = pset1<half2>(from);
651 p_alias[2] = pset1<half2>(from);
652 p_alias[3] = pset1<half2>(from);
659 return *
reinterpret_cast<const half2*
>(from);
665 return __halves2half2(from[0], from[0]);
669 *
reinterpret_cast<half2*
>(to) = from;
673 to[0] = __low2half(from);
674 to[1] = __high2half(from);
678 #if defined(EIGEN_GPU_HAS_LDG)
680 return __ldg(
reinterpret_cast<const half2*
>(from));
682 return __halves2half2(*(from + 0), *(from + 1));
687 #if defined(EIGEN_GPU_HAS_LDG)
688 return __halves2half2(__ldg(from + 0), __ldg(from + 1));
690 return __halves2half2(*(from + 0), *(from + 1));
695 return __halves2half2(from[0 * stride], from[1 * stride]);
699 to[stride * 0] = __low2half(from);
700 to[stride * 1] = __high2half(from);
706 half a1 = __low2half(
a);
707 half a2 = __high2half(
a);
710 return __halves2half2(result1, result2);
715 return pset1<half2>(true_half);
720 return pset1<half2>(false_half);
724 __half a1 = __low2half(kernel.packet[0]);
725 __half a2 = __high2half(kernel.packet[0]);
726 __half b1 = __low2half(kernel.packet[1]);
727 __half b2 = __high2half(kernel.packet[1]);
728 kernel.packet[0] = __halves2half2(a1, b1);
729 kernel.packet[1] = __halves2half2(a2, b2);
733 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
734 return __halves2half2(
a, __hadd(
a, __float2half(1.0f)));
736 float f = __half2float(
a) + 1.0f;
737 return __halves2half2(
a, __float2half(
f));
742 half mask_low = __low2half(mask);
743 half mask_high = __high2half(mask);
744 half result_low = mask_low == half(0) ? __low2half(
b) : __low2half(
a);
745 half result_high = mask_high == half(0) ? __high2half(
b) : __high2half(
a);
746 return __halves2half2(result_low, result_high);
752 half a1 = __low2half(
a);
753 half a2 = __high2half(
a);
754 half b1 = __low2half(
b);
755 half b2 = __high2half(
b);
756 half eq1 = __half2float(a1) == __half2float(b1) ? true_half : false_half;
757 half eq2 = __half2float(a2) == __half2float(b2) ? true_half : false_half;
758 return __halves2half2(eq1, eq2);
764 half a1 = __low2half(
a);
765 half a2 = __high2half(
a);
766 half b1 = __low2half(
b);
767 half b2 = __high2half(
b);
768 half eq1 = __half2float(a1) < __half2float(b1) ? true_half : false_half;
769 half eq2 = __half2float(a2) < __half2float(b2) ? true_half : false_half;
770 return __halves2half2(eq1, eq2);
776 half a1 = __low2half(
a);
777 half a2 = __high2half(
a);
778 half b1 = __low2half(
b);
779 half b2 = __high2half(
b);
780 half eq1 = __half2float(a1) <= __half2float(b1) ? true_half : false_half;
781 half eq2 = __half2float(a2) <= __half2float(b2) ? true_half : false_half;
782 return __halves2half2(eq1, eq2);
786 half a1 = __low2half(
a);
787 half a2 = __high2half(
a);
788 half b1 = __low2half(
b);
789 half b2 = __high2half(
b);
792 return __halves2half2(result1, result2);
796 half a1 = __low2half(
a);
797 half a2 = __high2half(
a);
798 half b1 = __low2half(
b);
799 half b2 = __high2half(
b);
802 return __halves2half2(result1, result2);
806 half a1 = __low2half(
a);
807 half a2 = __high2half(
a);
808 half b1 = __low2half(
b);
809 half b2 = __high2half(
b);
812 return __halves2half2(result1, result2);
816 half a1 = __low2half(
a);
817 half a2 = __high2half(
a);
818 half b1 = __low2half(
b);
819 half b2 = __high2half(
b);
822 return __halves2half2(result1, result2);
826 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
827 return __hadd2(
a,
b);
829 float a1 = __low2float(
a);
830 float a2 = __high2float(
a);
831 float b1 = __low2float(
b);
832 float b2 = __high2float(
b);
835 return __floats2half2_rn(r1, r2);
840 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
841 return __hsub2(
a,
b);
843 float a1 = __low2float(
a);
844 float a2 = __high2float(
a);
845 float b1 = __low2float(
b);
846 float b2 = __high2float(
b);
849 return __floats2half2_rn(r1, r2);
854 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
857 float a1 = __low2float(
a);
858 float a2 = __high2float(
a);
859 return __floats2half2_rn(-a1, -a2);
866 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
867 return __hmul2(
a,
b);
869 float a1 = __low2float(
a);
870 float a2 = __high2float(
a);
871 float b1 = __low2float(
b);
872 float b2 = __high2float(
b);
875 return __floats2half2_rn(r1, r2);
880 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
881 return __hfma2(
a,
b,
c);
883 float a1 = __low2float(
a);
884 float a2 = __high2float(
a);
885 float b1 = __low2float(
b);
886 float b2 = __high2float(
b);
887 float c1 = __low2float(
c);
888 float c2 = __high2float(
c);
889 float r1 = a1 * b1 + c1;
890 float r2 = a2 * b2 + c2;
891 return __floats2half2_rn(r1, r2);
896 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
897 return __h2div(
a,
b);
899 float a1 = __low2float(
a);
900 float a2 = __high2float(
a);
901 float b1 = __low2float(
b);
902 float b2 = __high2float(
b);
905 return __floats2half2_rn(r1, r2);
910 float a1 = __low2float(
a);
911 float a2 = __high2float(
a);
912 float b1 = __low2float(
b);
913 float b2 = __high2float(
b);
914 __half r1 = a1 < b1 ? __low2half(
a) : __low2half(
b);
915 __half r2 = a2 < b2 ? __high2half(
a) : __high2half(
b);
916 return __halves2half2(r1, r2);
920 float a1 = __low2float(
a);
921 float a2 = __high2float(
a);
922 float b1 = __low2float(
b);
923 float b2 = __high2float(
b);
924 __half r1 = a1 > b1 ? __low2half(
a) : __low2half(
b);
925 __half r2 = a2 > b2 ? __high2half(
a) : __high2half(
b);
926 return __halves2half2(r1, r2);
930 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
931 return __hadd(__low2half(
a), __high2half(
a));
933 float a1 = __low2float(
a);
934 float a2 = __high2float(
a);
940 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
941 __half
first = __low2half(
a);
942 __half
second = __high2half(
a);
945 float a1 = __low2float(
a);
946 float a2 = __high2float(
a);
947 return a1 > a2 ? __low2half(
a) : __high2half(
a);
952 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
953 __half
first = __low2half(
a);
954 __half
second = __high2half(
a);
957 float a1 = __low2float(
a);
958 float a2 = __high2float(
a);
959 return a1 < a2 ? __low2half(
a) : __high2half(
a);
964 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
965 return __hmul(__low2half(
a), __high2half(
a));
967 float a1 = __low2float(
a);
968 float a2 = __high2float(
a);
974 float a1 = __low2float(
a);
975 float a2 = __high2float(
a);
976 float r1 = log1pf(a1);
977 float r2 = log1pf(a2);
978 return __floats2half2_rn(r1, r2);
982 float a1 = __low2float(
a);
983 float a2 = __high2float(
a);
984 float r1 = expm1f(a1);
985 float r2 = expm1f(a2);
986 return __floats2half2_rn(r1, r2);
989 #if (EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)) || defined(EIGEN_HIP_DEVICE_COMPILE)
1002 float a1 = __low2float(
a);
1003 float a2 = __high2float(
a);
1004 float r1 = logf(a1);
1005 float r2 = logf(a2);
1006 return __floats2half2_rn(r1, r2);
1010 float a1 = __low2float(
a);
1011 float a2 = __high2float(
a);
1012 float r1 = expf(a1);
1013 float r2 = expf(a2);
1014 return __floats2half2_rn(r1, r2);
1018 float a1 = __low2float(
a);
1019 float a2 = __high2float(
a);
1020 float r1 = sqrtf(a1);
1021 float r2 = sqrtf(a2);
1022 return __floats2half2_rn(r1, r2);
1026 float a1 = __low2float(
a);
1027 float a2 = __high2float(
a);
1028 float r1 = rsqrtf(a1);
1029 float r2 = rsqrtf(a2);
1030 return __floats2half2_rn(r1, r2);
1037 return *
reinterpret_cast<const Packet4h2*
>(from);
1044 half2* p_alias =
reinterpret_cast<half2*
>(&
r);
1045 p_alias[0] =
ploadu(from + 0);
1046 p_alias[1] =
ploadu(from + 2);
1047 p_alias[2] =
ploadu(from + 4);
1048 p_alias[3] =
ploadu(from + 6);
1055 half2* p_alias =
reinterpret_cast<half2*
>(&
r);
1065 *
reinterpret_cast<Packet4h2*
>(to) = from;
1070 const half2* from_alias =
reinterpret_cast<const half2*
>(&from);
1071 pstoreu(to + 0, from_alias[0]);
1072 pstoreu(to + 2, from_alias[1]);
1073 pstoreu(to + 4, from_alias[2]);
1074 pstoreu(to + 6, from_alias[3]);
1079 #if defined(EIGEN_GPU_HAS_LDG)
1081 r = __ldg(
reinterpret_cast<const Packet4h2*
>(from));
1085 half2* r_alias =
reinterpret_cast<half2*
>(&
r);
1086 r_alias[0] = ploadt_ro_aligned(from + 0);
1087 r_alias[1] = ploadt_ro_aligned(from + 2);
1088 r_alias[2] = ploadt_ro_aligned(from + 4);
1089 r_alias[3] = ploadt_ro_aligned(from + 6);
1097 half2* r_alias =
reinterpret_cast<half2*
>(&
r);
1098 r_alias[0] = ploadt_ro_unaligned(from + 0);
1099 r_alias[1] = ploadt_ro_unaligned(from + 2);
1100 r_alias[2] = ploadt_ro_unaligned(from + 4);
1101 r_alias[3] = ploadt_ro_unaligned(from + 6);
1108 half2* p_alias =
reinterpret_cast<half2*
>(&
r);
1109 p_alias[0] = __halves2half2(from[0 * stride], from[1 * stride]);
1110 p_alias[1] = __halves2half2(from[2 * stride], from[3 * stride]);
1111 p_alias[2] = __halves2half2(from[4 * stride], from[5 * stride]);
1112 p_alias[3] = __halves2half2(from[6 * stride], from[7 * stride]);
1119 const half2* from_alias =
reinterpret_cast<const half2*
>(&from);
1120 pscatter(to + stride * 0, from_alias[0], stride);
1121 pscatter(to + stride * 2, from_alias[1], stride);
1122 pscatter(to + stride * 4, from_alias[2], stride);
1123 pscatter(to + stride * 6, from_alias[3], stride);
1128 return pfirst(*(
reinterpret_cast<const half2*
>(&
a)));
1134 half2* p_alias =
reinterpret_cast<half2*
>(&
r);
1135 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1136 p_alias[0] =
pabs(a_alias[0]);
1137 p_alias[1] =
pabs(a_alias[1]);
1138 p_alias[2] =
pabs(a_alias[2]);
1139 p_alias[3] =
pabs(a_alias[3]);
1146 return pset1<Packet4h2>(true_half);
1152 return pset1<Packet4h2>(false_half);
1156 double* d_row3,
double* d_row4,
double* d_row5,
1157 double* d_row6,
double* d_row7) {
1160 d_row0[1] = d_row4[0];
1164 d_row1[1] = d_row5[0];
1168 d_row2[1] = d_row6[0];
1172 d_row3[1] = d_row7[0];
1180 f_row0[1] = f_row2[0];
1184 f_row1[1] = f_row3[0];
1189 __half a1 = __low2half(f0);
1190 __half a2 = __high2half(f0);
1191 __half b1 = __low2half(
f1);
1192 __half b2 = __high2half(
f1);
1193 f0 = __halves2half2(a1, b1);
1194 f1 = __halves2half2(a2, b2);
1198 double* d_row0 =
reinterpret_cast<double*
>(&kernel.packet[0]);
1199 double* d_row1 =
reinterpret_cast<double*
>(&kernel.packet[1]);
1200 double* d_row2 =
reinterpret_cast<double*
>(&kernel.packet[2]);
1201 double* d_row3 =
reinterpret_cast<double*
>(&kernel.packet[3]);
1202 double* d_row4 =
reinterpret_cast<double*
>(&kernel.packet[4]);
1203 double* d_row5 =
reinterpret_cast<double*
>(&kernel.packet[5]);
1204 double* d_row6 =
reinterpret_cast<double*
>(&kernel.packet[6]);
1205 double* d_row7 =
reinterpret_cast<double*
>(&kernel.packet[7]);
1206 ptranspose_double(d_row0, d_row1, d_row2, d_row3, d_row4, d_row5, d_row6, d_row7);
1208 half2* f_row0 =
reinterpret_cast<half2*
>(d_row0);
1209 half2* f_row1 =
reinterpret_cast<half2*
>(d_row1);
1210 half2* f_row2 =
reinterpret_cast<half2*
>(d_row2);
1211 half2* f_row3 =
reinterpret_cast<half2*
>(d_row3);
1212 ptranspose_half2(f_row0, f_row1, f_row2, f_row3);
1213 ptranspose_half(f_row0[0], f_row1[0]);
1214 ptranspose_half(f_row0[1], f_row1[1]);
1215 ptranspose_half(f_row2[0], f_row3[0]);
1216 ptranspose_half(f_row2[1], f_row3[1]);
1218 f_row0 =
reinterpret_cast<half2*
>(d_row0 + 1);
1219 f_row1 =
reinterpret_cast<half2*
>(d_row1 + 1);
1220 f_row2 =
reinterpret_cast<half2*
>(d_row2 + 1);
1221 f_row3 =
reinterpret_cast<half2*
>(d_row3 + 1);
1222 ptranspose_half2(f_row0, f_row1, f_row2, f_row3);
1223 ptranspose_half(f_row0[0], f_row1[0]);
1224 ptranspose_half(f_row0[1], f_row1[1]);
1225 ptranspose_half(f_row2[0], f_row3[0]);
1226 ptranspose_half(f_row2[1], f_row3[1]);
1228 f_row0 =
reinterpret_cast<half2*
>(d_row4);
1229 f_row1 =
reinterpret_cast<half2*
>(d_row5);
1230 f_row2 =
reinterpret_cast<half2*
>(d_row6);
1231 f_row3 =
reinterpret_cast<half2*
>(d_row7);
1232 ptranspose_half2(f_row0, f_row1, f_row2, f_row3);
1233 ptranspose_half(f_row0[0], f_row1[0]);
1234 ptranspose_half(f_row0[1], f_row1[1]);
1235 ptranspose_half(f_row2[0], f_row3[0]);
1236 ptranspose_half(f_row2[1], f_row3[1]);
1238 f_row0 =
reinterpret_cast<half2*
>(d_row4 + 1);
1239 f_row1 =
reinterpret_cast<half2*
>(d_row5 + 1);
1240 f_row2 =
reinterpret_cast<half2*
>(d_row6 + 1);
1241 f_row3 =
reinterpret_cast<half2*
>(d_row7 + 1);
1242 ptranspose_half2(f_row0, f_row1, f_row2, f_row3);
1243 ptranspose_half(f_row0[0], f_row1[0]);
1244 ptranspose_half(f_row0[1], f_row1[1]);
1245 ptranspose_half(f_row2[0], f_row3[0]);
1246 ptranspose_half(f_row2[1], f_row3[1]);
1251 #if defined(EIGEN_HIP_DEVICE_COMPILE)
1254 half2* p_alias =
reinterpret_cast<half2*
>(&
r);
1255 p_alias[0] = __halves2half2(
a, __hadd(
a, __float2half(1.0f)));
1256 p_alias[1] = __halves2half2(__hadd(
a, __float2half(2.0f)), __hadd(
a, __float2half(3.0f)));
1257 p_alias[2] = __halves2half2(__hadd(
a, __float2half(4.0f)), __hadd(
a, __float2half(5.0f)));
1258 p_alias[3] = __halves2half2(__hadd(
a, __float2half(6.0f)), __hadd(
a, __float2half(7.0f)));
1260 #elif defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
1262 half2* r_alias =
reinterpret_cast<half2*
>(&
r);
1264 half2
b = pset1<half2>(
a);
1266 half2 half_offset0 = __halves2half2(__float2half(0.0f), __float2half(2.0f));
1267 half2 half_offset1 = __halves2half2(__float2half(4.0f), __float2half(6.0f));
1269 c = __hadd2(
b, half_offset0);
1270 r_alias[0] =
plset(__low2half(
c));
1271 r_alias[1] =
plset(__high2half(
c));
1273 c = __hadd2(
b, half_offset1);
1274 r_alias[2] =
plset(__low2half(
c));
1275 r_alias[3] =
plset(__high2half(
c));
1280 float f = __half2float(
a);
1282 half2* p_alias =
reinterpret_cast<half2*
>(&
r);
1283 p_alias[0] = __halves2half2(
a, __float2half(
f + 1.0f));
1284 p_alias[1] = __halves2half2(__float2half(
f + 2.0f), __float2half(
f + 3.0f));
1285 p_alias[2] = __halves2half2(__float2half(
f + 4.0f), __float2half(
f + 5.0f));
1286 p_alias[3] = __halves2half2(__float2half(
f + 6.0f), __float2half(
f + 7.0f));
1293 const Packet4h2&
b) {
1295 half2* r_alias =
reinterpret_cast<half2*
>(&
r);
1296 const half2* mask_alias =
reinterpret_cast<const half2*
>(&mask);
1297 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1298 const half2* b_alias =
reinterpret_cast<const half2*
>(&
b);
1299 r_alias[0] =
pselect(mask_alias[0], a_alias[0], b_alias[0]);
1300 r_alias[1] =
pselect(mask_alias[1], a_alias[1], b_alias[1]);
1301 r_alias[2] =
pselect(mask_alias[2], a_alias[2], b_alias[2]);
1302 r_alias[3] =
pselect(mask_alias[3], a_alias[3], b_alias[3]);
1309 half2* r_alias =
reinterpret_cast<half2*
>(&
r);
1310 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1311 const half2* b_alias =
reinterpret_cast<const half2*
>(&
b);
1312 r_alias[0] =
pcmp_eq(a_alias[0], b_alias[0]);
1313 r_alias[1] =
pcmp_eq(a_alias[1], b_alias[1]);
1314 r_alias[2] =
pcmp_eq(a_alias[2], b_alias[2]);
1315 r_alias[3] =
pcmp_eq(a_alias[3], b_alias[3]);
1322 half2* r_alias =
reinterpret_cast<half2*
>(&
r);
1323 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1324 const half2* b_alias =
reinterpret_cast<const half2*
>(&
b);
1325 r_alias[0] =
pcmp_lt(a_alias[0], b_alias[0]);
1326 r_alias[1] =
pcmp_lt(a_alias[1], b_alias[1]);
1327 r_alias[2] =
pcmp_lt(a_alias[2], b_alias[2]);
1328 r_alias[3] =
pcmp_lt(a_alias[3], b_alias[3]);
1335 half2* r_alias =
reinterpret_cast<half2*
>(&
r);
1336 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1337 const half2* b_alias =
reinterpret_cast<const half2*
>(&
b);
1338 r_alias[0] =
pcmp_le(a_alias[0], b_alias[0]);
1339 r_alias[1] =
pcmp_le(a_alias[1], b_alias[1]);
1340 r_alias[2] =
pcmp_le(a_alias[2], b_alias[2]);
1341 r_alias[3] =
pcmp_le(a_alias[3], b_alias[3]);
1348 half2* r_alias =
reinterpret_cast<half2*
>(&
r);
1349 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1350 const half2* b_alias =
reinterpret_cast<const half2*
>(&
b);
1351 r_alias[0] =
pand(a_alias[0], b_alias[0]);
1352 r_alias[1] =
pand(a_alias[1], b_alias[1]);
1353 r_alias[2] =
pand(a_alias[2], b_alias[2]);
1354 r_alias[3] =
pand(a_alias[3], b_alias[3]);
1361 half2* r_alias =
reinterpret_cast<half2*
>(&
r);
1362 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1363 const half2* b_alias =
reinterpret_cast<const half2*
>(&
b);
1364 r_alias[0] =
por(a_alias[0], b_alias[0]);
1365 r_alias[1] =
por(a_alias[1], b_alias[1]);
1366 r_alias[2] =
por(a_alias[2], b_alias[2]);
1367 r_alias[3] =
por(a_alias[3], b_alias[3]);
1374 half2* r_alias =
reinterpret_cast<half2*
>(&
r);
1375 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1376 const half2* b_alias =
reinterpret_cast<const half2*
>(&
b);
1377 r_alias[0] =
pxor(a_alias[0], b_alias[0]);
1378 r_alias[1] =
pxor(a_alias[1], b_alias[1]);
1379 r_alias[2] =
pxor(a_alias[2], b_alias[2]);
1380 r_alias[3] =
pxor(a_alias[3], b_alias[3]);
1387 half2* r_alias =
reinterpret_cast<half2*
>(&
r);
1388 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1389 const half2* b_alias =
reinterpret_cast<const half2*
>(&
b);
1390 r_alias[0] =
pandnot(a_alias[0], b_alias[0]);
1391 r_alias[1] =
pandnot(a_alias[1], b_alias[1]);
1392 r_alias[2] =
pandnot(a_alias[2], b_alias[2]);
1393 r_alias[3] =
pandnot(a_alias[3], b_alias[3]);
1400 half2* r_alias =
reinterpret_cast<half2*
>(&
r);
1401 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1402 const half2* b_alias =
reinterpret_cast<const half2*
>(&
b);
1403 r_alias[0] =
padd(a_alias[0], b_alias[0]);
1404 r_alias[1] =
padd(a_alias[1], b_alias[1]);
1405 r_alias[2] =
padd(a_alias[2], b_alias[2]);
1406 r_alias[3] =
padd(a_alias[3], b_alias[3]);
1413 half2* r_alias =
reinterpret_cast<half2*
>(&
r);
1414 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1415 const half2* b_alias =
reinterpret_cast<const half2*
>(&
b);
1416 r_alias[0] =
psub(a_alias[0], b_alias[0]);
1417 r_alias[1] =
psub(a_alias[1], b_alias[1]);
1418 r_alias[2] =
psub(a_alias[2], b_alias[2]);
1419 r_alias[3] =
psub(a_alias[3], b_alias[3]);
1426 half2* r_alias =
reinterpret_cast<half2*
>(&
r);
1427 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1428 r_alias[0] =
pnegate(a_alias[0]);
1429 r_alias[1] =
pnegate(a_alias[1]);
1430 r_alias[2] =
pnegate(a_alias[2]);
1431 r_alias[3] =
pnegate(a_alias[3]);
1443 half2* r_alias =
reinterpret_cast<half2*
>(&
r);
1444 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1445 const half2* b_alias =
reinterpret_cast<const half2*
>(&
b);
1446 r_alias[0] =
pmul(a_alias[0], b_alias[0]);
1447 r_alias[1] =
pmul(a_alias[1], b_alias[1]);
1448 r_alias[2] =
pmul(a_alias[2], b_alias[2]);
1449 r_alias[3] =
pmul(a_alias[3], b_alias[3]);
1455 const Packet4h2&
c) {
1457 half2* r_alias =
reinterpret_cast<half2*
>(&
r);
1458 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1459 const half2* b_alias =
reinterpret_cast<const half2*
>(&
b);
1460 const half2* c_alias =
reinterpret_cast<const half2*
>(&
c);
1461 r_alias[0] =
pmadd(a_alias[0], b_alias[0], c_alias[0]);
1462 r_alias[1] =
pmadd(a_alias[1], b_alias[1], c_alias[1]);
1463 r_alias[2] =
pmadd(a_alias[2], b_alias[2], c_alias[2]);
1464 r_alias[3] =
pmadd(a_alias[3], b_alias[3], c_alias[3]);
1471 half2* r_alias =
reinterpret_cast<half2*
>(&
r);
1472 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1473 const half2* b_alias =
reinterpret_cast<const half2*
>(&
b);
1474 r_alias[0] =
pdiv(a_alias[0], b_alias[0]);
1475 r_alias[1] =
pdiv(a_alias[1], b_alias[1]);
1476 r_alias[2] =
pdiv(a_alias[2], b_alias[2]);
1477 r_alias[3] =
pdiv(a_alias[3], b_alias[3]);
1484 half2* r_alias =
reinterpret_cast<half2*
>(&
r);
1485 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1486 const half2* b_alias =
reinterpret_cast<const half2*
>(&
b);
1487 r_alias[0] =
pmin(a_alias[0], b_alias[0]);
1488 r_alias[1] =
pmin(a_alias[1], b_alias[1]);
1489 r_alias[2] =
pmin(a_alias[2], b_alias[2]);
1490 r_alias[3] =
pmin(a_alias[3], b_alias[3]);
1497 half2* r_alias =
reinterpret_cast<half2*
>(&
r);
1498 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1499 const half2* b_alias =
reinterpret_cast<const half2*
>(&
b);
1500 r_alias[0] =
pmax(a_alias[0], b_alias[0]);
1501 r_alias[1] =
pmax(a_alias[1], b_alias[1]);
1502 r_alias[2] =
pmax(a_alias[2], b_alias[2]);
1503 r_alias[3] =
pmax(a_alias[3], b_alias[3]);
1509 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1516 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1521 #if defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
1522 return (__hgt(first, second) ? first : second);
1524 float ffirst = __half2float(first);
1525 float fsecond = __half2float(second);
1532 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1537 #if defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
1538 return (__hlt(first, second) ? first : second);
1540 float ffirst = __half2float(first);
1541 float fsecond = __half2float(second);
1549 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1556 half2* r_alias =
reinterpret_cast<half2*
>(&
r);
1557 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1558 r_alias[0] =
plog1p(a_alias[0]);
1559 r_alias[1] =
plog1p(a_alias[1]);
1560 r_alias[2] =
plog1p(a_alias[2]);
1561 r_alias[3] =
plog1p(a_alias[3]);
1568 half2* r_alias =
reinterpret_cast<half2*
>(&
r);
1569 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1570 r_alias[0] =
pexpm1(a_alias[0]);
1571 r_alias[1] =
pexpm1(a_alias[1]);
1572 r_alias[2] =
pexpm1(a_alias[2]);
1573 r_alias[3] =
pexpm1(a_alias[3]);
1580 half2* r_alias =
reinterpret_cast<half2*
>(&
r);
1581 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1582 r_alias[0] =
plog(a_alias[0]);
1583 r_alias[1] =
plog(a_alias[1]);
1584 r_alias[2] =
plog(a_alias[2]);
1585 r_alias[3] =
plog(a_alias[3]);
1592 half2* r_alias =
reinterpret_cast<half2*
>(&
r);
1593 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1594 r_alias[0] =
pexp(a_alias[0]);
1595 r_alias[1] =
pexp(a_alias[1]);
1596 r_alias[2] =
pexp(a_alias[2]);
1597 r_alias[3] =
pexp(a_alias[3]);
1604 half2* r_alias =
reinterpret_cast<half2*
>(&
r);
1605 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1606 r_alias[0] =
psqrt(a_alias[0]);
1607 r_alias[1] =
psqrt(a_alias[1]);
1608 r_alias[2] =
psqrt(a_alias[2]);
1609 r_alias[3] =
psqrt(a_alias[3]);
1616 half2* r_alias =
reinterpret_cast<half2*
>(&
r);
1617 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1618 r_alias[0] =
prsqrt(a_alias[0]);
1619 r_alias[1] =
prsqrt(a_alias[1]);
1620 r_alias[2] =
prsqrt(a_alias[2]);
1621 r_alias[3] =
prsqrt(a_alias[3]);
1629 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
1630 return __hadd2(
a,
b);
1632 float a1 = __low2float(
a);
1633 float a2 = __high2float(
a);
1634 float b1 = __low2float(
b);
1635 float b2 = __high2float(
b);
1638 return __floats2half2_rn(r1, r2);
1644 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
1645 return __hmul2(
a,
b);
1647 float a1 = __low2float(
a);
1648 float a2 = __high2float(
a);
1649 float b1 = __low2float(
b);
1650 float b2 = __high2float(
b);
1653 return __floats2half2_rn(r1, r2);
1659 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
1660 return __h2div(
a,
b);
1662 float a1 = __low2float(
a);
1663 float a2 = __high2float(
a);
1664 float b1 = __low2float(
b);
1665 float b2 = __high2float(
b);
1668 return __floats2half2_rn(r1, r2);
1674 float a1 = __low2float(
a);
1675 float a2 = __high2float(
a);
1676 float b1 = __low2float(
b);
1677 float b2 = __high2float(
b);
1678 __half r1 = a1 < b1 ? __low2half(
a) : __low2half(
b);
1679 __half r2 = a2 < b2 ? __high2half(
a) : __high2half(
b);
1680 return __halves2half2(r1, r2);
1685 float a1 = __low2float(
a);
1686 float a2 = __high2float(
a);
1687 float b1 = __low2float(
b);
1688 float b2 = __high2float(
b);
1689 __half r1 = a1 > b1 ? __low2half(
a) : __low2half(
b);
1690 __half r2 = a2 > b2 ? __high2half(
a) : __high2half(
b);
1691 return __halves2half2(r1, r2);
1696 #undef EIGEN_GPU_HAS_LDG
1697 #undef EIGEN_CUDA_HAS_FP16_ARITHMETIC
1698 #undef EIGEN_GPU_HAS_FP16_ARITHMETIC
#define EIGEN_ALWAYS_INLINE
Definition: Macros.h:845
#define EIGEN_DEVICE_FUNC
Definition: Macros.h:892
#define EIGEN_STRONG_INLINE
Definition: Macros.h:834
Scalar * b
Definition: benchVecAdd.cpp:17
EIGEN_STRONG_INLINE PacketScalar packet(Index rowId, Index colId) const
Definition: PlainObjectBase.h:247
static int f(const TensorMap< Tensor< int, 3 > > &tensor)
Definition: cxx11_tensor_map.cpp:237
@ Aligned16
Definition: Constants.h:237
const Scalar * a
Definition: level2_cplx_impl.h:32
Eigen::Matrix< Scalar, Dynamic, Dynamic, ColMajor > tmp
Definition: level3_impl.h:365
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 floor(const bfloat16 &a)
Definition: BFloat16.h:643
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 ceil(const bfloat16 &a)
Definition: BFloat16.h:644
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 trunc(const bfloat16 &a)
Definition: BFloat16.h:647
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 rint(const bfloat16 &a)
Definition: BFloat16.h:645
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 fmin(const bfloat16 &a, const bfloat16 &b)
Definition: BFloat16.h:664
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 fmax(const bfloat16 &a, const bfloat16 &b)
Definition: BFloat16.h:670
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw raw_uint16_to_half(numext::uint16_t x)
Definition: Half.h:496
EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexpm1(const Packet &a)
Definition: GenericPacketMath.h:1097
EIGEN_STRONG_INLINE void pstoreu< double >(double *to, const Packet4d &from)
Definition: AVX/PacketMath.h:1628
EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf &a)
Definition: AltiVec/Complex.h:268
EIGEN_DEVICE_FUNC Packet padd(const Packet &a, const Packet &b)
Definition: GenericPacketMath.h:318
EIGEN_STRONG_INLINE Packet8f pzero(const Packet8f &)
Definition: AVX/PacketMath.h:774
EIGEN_STRONG_INLINE void ptranspose(PacketBlock< Packet2cf, 2 > &kernel)
Definition: AltiVec/Complex.h:339
EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog(const Packet &a)
Definition: GenericPacketMath.h:1103
EIGEN_DEVICE_FUNC Packet pdiv(const Packet &a, const Packet &b)
Definition: GenericPacketMath.h:368
EIGEN_DEVICE_FUNC unpacket_traits< Packet >::type predux_max(const Packet &a)
Definition: GenericPacketMath.h:1258
EIGEN_DEVICE_FUNC Packet ploadu(const typename unpacket_traits< Packet >::type *from)
Definition: GenericPacketMath.h:775
EIGEN_DEVICE_FUNC Packet pmax(const Packet &a, const Packet &b)
Definition: GenericPacketMath.h:663
EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f &a, const Packet4f &b)
Definition: AltiVec/PacketMath.h:1314
EIGEN_DEVICE_FUNC unpacket_traits< Packet >::type predux_min(const Packet &a)
Definition: GenericPacketMath.h:1245
EIGEN_STRONG_INLINE Packet8h por(const Packet8h &a, const Packet8h &b)
Definition: AVX/PacketMath.h:2309
EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i &a, const Packet4i &b)
Definition: AltiVec/PacketMath.h:1341
EIGEN_STRONG_INLINE void pstore< double >(double *to, const Packet4d &from)
Definition: AVX/PacketMath.h:1611
EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f &a, const Packet4f &b, const Packet4f &c)
Definition: AltiVec/PacketMath.h:1218
EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf &a, const Packet4cf &b)
Definition: AVX/Complex.h:88
EIGEN_STRONG_INLINE Packet8h ptrue(const Packet8h &a)
Definition: AVX/PacketMath.h:2263
EIGEN_DEVICE_FUNC Packet pgather(const Packet &src, const Scalar *from, Index stride, typename unpacket_traits< Packet >::mask_t umask)
EIGEN_DEVICE_FUNC Packet pmin(const Packet &a, const Packet &b)
Definition: GenericPacketMath.h:649
EIGEN_STRONG_INLINE Packet8h pandnot(const Packet8h &a, const Packet8h &b)
Definition: AVX/PacketMath.h:2323
EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf &a)
Definition: AltiVec/Complex.h:264
EIGEN_STRONG_INLINE void pstore< float >(float *to, const Packet4f &from)
Definition: AltiVec/PacketMath.h:642
EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f &a)
Definition: AltiVec/PacketMath.h:1936
EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog1p(const Packet &a)
Definition: GenericPacketMath.h:1110
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet ploaddup(const typename unpacket_traits< Packet >::type *from)
Definition: GenericPacketMath.h:824
EIGEN_STRONG_INLINE bfloat16 pfirst(const Packet8bf &a)
Definition: AltiVec/PacketMath.h:2418
EIGEN_DEVICE_FUNC void pscatter(Scalar *to, const Packet &from, Index stride, typename unpacket_traits< Packet >::mask_t umask)
EIGEN_DEVICE_FUNC void pstore(Scalar *to, const Packet &from)
Definition: GenericPacketMath.h:891
EIGEN_DEVICE_FUNC unpacket_traits< Packet >::type predux(const Packet &a)
Definition: GenericPacketMath.h:1232
EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f &a)
Definition: LSX/PacketMath.h:2176
EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf &a, const Packet2cf &b)
Definition: AltiVec/Complex.h:353
EIGEN_DEVICE_FUNC void pstoreu(Scalar *to, const Packet &from)
Definition: GenericPacketMath.h:911
EIGEN_STRONG_INLINE Packet8h pand(const Packet8h &a, const Packet8h &b)
Definition: AVX/PacketMath.h:2319
EIGEN_STRONG_INLINE Packet8h pxor(const Packet8h &a, const Packet8h &b)
Definition: AVX/PacketMath.h:2315
EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f &mask, const Packet4f &a, const Packet4f &b)
Definition: AltiVec/PacketMath.h:1474
EIGEN_DEVICE_FUNC Packet psub(const Packet &a, const Packet &b)
Definition: GenericPacketMath.h:337
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet plset(const typename unpacket_traits< Packet >::type &a)
Returns a packet with coefficients (a,a+1,...,a+packet_size-1).
Definition: GenericPacketMath.h:872
EIGEN_DEVICE_FUNC Packet pload(const typename unpacket_traits< Packet >::type *from)
Definition: GenericPacketMath.h:752
EIGEN_STRONG_INLINE Packet4f prsqrt(const Packet4f &a)
Definition: LSX/PacketMath.h:2528
EIGEN_STRONG_INLINE void pstoreu< float >(float *to, const Packet4f &from)
Definition: AltiVec/PacketMath.h:1756
EIGEN_STRONG_INLINE Packet4f pexp(const Packet4f &_x)
Definition: LSX/PacketMath.h:2663
EIGEN_DEVICE_FUNC unpacket_traits< Packet >::type predux_mul(const Packet &a)
Definition: GenericPacketMath.h:1238
Namespace containing all symbols from the Eigen library.
Definition: bench_norm.cpp:70
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition: Meta.h:83
double f1(const Vector< double > &coord)
f1 function, in front of the C1 unknown
Definition: poisson/poisson_with_singularity/two_d_poisson.cc:147
Real fabs(const Real &a)
Definition: boostmultiprec.cpp:117
int c
Definition: calibrate.py:100
Definition: Eigen_Colamd.h:49
@ HasZeta
Definition: GenericPacketMath.h:93
@ HasIGamma
Definition: GenericPacketMath.h:99
@ HasRsqrt
Definition: GenericPacketMath.h:74
@ HasSin
Definition: GenericPacketMath.h:81
@ HasBlend
Definition: GenericPacketMath.h:66
@ HasErfc
Definition: GenericPacketMath.h:96
@ HasNdtri
Definition: GenericPacketMath.h:97
@ HasCos
Definition: GenericPacketMath.h:82
@ HasPolygamma
Definition: GenericPacketMath.h:94
@ HasDiGamma
Definition: GenericPacketMath.h:92
@ HasLog1p
Definition: GenericPacketMath.h:78
@ HasIGammac
Definition: GenericPacketMath.h:102
@ HasExp
Definition: GenericPacketMath.h:75
@ HasBetaInc
Definition: GenericPacketMath.h:103
@ HasSqrt
Definition: GenericPacketMath.h:73
@ HasLGamma
Definition: GenericPacketMath.h:91
@ HasErf
Definition: GenericPacketMath.h:95
@ HasBessel
Definition: GenericPacketMath.h:98
@ HasExpm1
Definition: GenericPacketMath.h:76
@ HasLog
Definition: GenericPacketMath.h:77
@ HasGammaSampleDerAlpha
Definition: GenericPacketMath.h:101
@ HasIGammaDerA
Definition: GenericPacketMath.h:100
@ HasDiv
Definition: GenericPacketMath.h:71
@ value
Definition: Meta.h:146
T type
Definition: GenericPacketMath.h:109
@ size
Definition: GenericPacketMath.h:113
@ AlignedOnScalar
Definition: GenericPacketMath.h:114
@ Vectorizable
Definition: GenericPacketMath.h:112
T half
Definition: GenericPacketMath.h:110
@ HasSub
Definition: GenericPacketMath.h:118
@ HasMul
Definition: GenericPacketMath.h:119
@ HasAdd
Definition: GenericPacketMath.h:117
T type
Definition: GenericPacketMath.h:135
T half
Definition: GenericPacketMath.h:136
@ masked_load_available
Definition: GenericPacketMath.h:142
@ size
Definition: GenericPacketMath.h:139
@ masked_store_available
Definition: GenericPacketMath.h:143
@ vectorizable
Definition: GenericPacketMath.h:141
@ alignment
Definition: GenericPacketMath.h:140