NEON/PacketMath.h
Go to the documentation of this file.
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
5 // Copyright (C) 2010 Konstantinos Margaritis <markos@freevec.org>
6 // Heavily based on Gael's SSE version.
7 //
8 // This Source Code Form is subject to the terms of the Mozilla
9 // Public License v. 2.0. If a copy of the MPL was not distributed
10 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
11 
12 #ifndef EIGEN_PACKET_MATH_NEON_H
13 #define EIGEN_PACKET_MATH_NEON_H
14 
15 // IWYU pragma: private
16 #include "../../InternalHeaderCheck.h"
17 
18 namespace Eigen {
19 
20 namespace internal {
21 
22 #ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
23 #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
24 #endif
25 
26 #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
27 #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
28 #endif
29 
30 #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
31 #if EIGEN_ARCH_ARM64
32 #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
33 #else
34 #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16
35 #endif
36 #endif
37 
38 #if EIGEN_COMP_MSVC_STRICT
39 
40 // In MSVC's arm_neon.h header file, all NEON vector types
41 // are aliases to the same underlying type __n128.
42 // We thus have to wrap them to make them different C++ types.
43 // (See also bug 1428)
// Each alias below pairs an underlying type with its own integer tag (0..17),
// so no two packet names share a wrapper instantiation.
// Packet4c and Packet4uc wrap 32-bit scalars (int32_t / uint32_t) rather than
// NEON vector types.
44 typedef eigen_packet_wrapper<float32x2_t, 0> Packet2f;
45 typedef eigen_packet_wrapper<float32x4_t, 1> Packet4f;
46 typedef eigen_packet_wrapper<int32_t, 2> Packet4c;
47 typedef eigen_packet_wrapper<int8x8_t, 3> Packet8c;
48 typedef eigen_packet_wrapper<int8x16_t, 4> Packet16c;
49 typedef eigen_packet_wrapper<uint32_t, 5> Packet4uc;
50 typedef eigen_packet_wrapper<uint8x8_t, 6> Packet8uc;
51 typedef eigen_packet_wrapper<uint8x16_t, 7> Packet16uc;
52 typedef eigen_packet_wrapper<int16x4_t, 8> Packet4s;
53 typedef eigen_packet_wrapper<int16x8_t, 9> Packet8s;
54 typedef eigen_packet_wrapper<uint16x4_t, 10> Packet4us;
55 typedef eigen_packet_wrapper<uint16x8_t, 11> Packet8us;
56 typedef eigen_packet_wrapper<int32x2_t, 12> Packet2i;
57 typedef eigen_packet_wrapper<int32x4_t, 13> Packet4i;
58 typedef eigen_packet_wrapper<uint32x2_t, 14> Packet2ui;
59 typedef eigen_packet_wrapper<uint32x4_t, 15> Packet4ui;
60 typedef eigen_packet_wrapper<int64x2_t, 16> Packet2l;
61 typedef eigen_packet_wrapper<uint64x2_t, 17> Packet2ul;
62 
63 EIGEN_ALWAYS_INLINE Packet4f make_packet4f(float a, float b, float c, float d) {
64  float from[4] = {a, b, c, d};
65  return vld1q_f32(from);
66 }
67 
69  float from[2] = {a, b};
70  return vld1_f32(from);
71 }
72 
73 #else
74 
// Non-MSVC-strict branch: the packet names are plain aliases of the NEON
// vector types (no wrapper), e.g. Packet4f is float32x4_t itself.
75 typedef float32x2_t Packet2f;
76 typedef float32x4_t Packet4f;
78 typedef int8x8_t Packet8c;
79 typedef int8x16_t Packet16c;
81 typedef uint8x8_t Packet8uc;
82 typedef uint8x16_t Packet16uc;
83 typedef int16x4_t Packet4s;
84 typedef int16x8_t Packet8s;
85 typedef uint16x4_t Packet4us;
86 typedef uint16x8_t Packet8us;
87 typedef int32x2_t Packet2i;
88 typedef int32x4_t Packet4i;
89 typedef uint32x2_t Packet2ui;
90 typedef uint32x4_t Packet4ui;
91 typedef int64x2_t Packet2l;
92 typedef uint64x2_t Packet2ul;
93 
94 EIGEN_ALWAYS_INLINE Packet4f make_packet4f(float a, float b, float c, float d) { return Packet4f{a, b, c, d}; }
95 EIGEN_ALWAYS_INLINE Packet2f make_packet2f(float a, float b) { return Packet2f{a, b}; }
96 
97 #endif // EIGEN_COMP_MSVC_STRICT
98 
100  const float* a = reinterpret_cast<const float*>(&m);
101  Packet4f res =
102  make_packet4f(*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3)), *(a + ((mask >> 6) & 3)));
103  return res;
104 }
105 
106 // functionally equivalent to _mm_shuffle_ps in SSE when interleave
107 // == false (i.e. shuffle<false>(m, n, mask) equals _mm_shuffle_ps(m, n, mask)),
108 // interleave m and n when interleave == true. Currently used in LU/arch/InverseSize4.h
109 // to enable a shared implementation for fast inversion of matrices of size 4.
110 template <bool interleave>
111 EIGEN_STRONG_INLINE Packet4f shuffle2(const Packet4f& m, const Packet4f& n, int mask) {
112  const float* a = reinterpret_cast<const float*>(&m);
113  const float* b = reinterpret_cast<const float*>(&n);
114  Packet4f res =
115  make_packet4f(*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(b + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3)));
116  return res;
117 }
118 
119 template <>
120 EIGEN_STRONG_INLINE Packet4f shuffle2<true>(const Packet4f& m, const Packet4f& n, int mask) {
121  const float* a = reinterpret_cast<const float*>(&m);
122  const float* b = reinterpret_cast<const float*>(&n);
123  Packet4f res =
124  make_packet4f(*(a + (mask & 3)), *(b + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3)));
125  return res;
126 }
127 
128 EIGEN_STRONG_INLINE static int eigen_neon_shuffle_mask(int p, int q, int r, int s) {
129  return ((s) << 6 | (r) << 4 | (q) << 2 | (p));
130 }
131 
132 EIGEN_STRONG_INLINE Packet4f vec4f_swizzle1(const Packet4f& a, int p, int q, int r, int s) {
133  return shuffle1(a, eigen_neon_shuffle_mask(p, q, r, s));
134 }
135 EIGEN_STRONG_INLINE Packet4f vec4f_swizzle2(const Packet4f& a, const Packet4f& b, int p, int q, int r, int s) {
136  return shuffle2<false>(a, b, eigen_neon_shuffle_mask(p, q, r, s));
137 }
139  return shuffle2<false>(a, b, eigen_neon_shuffle_mask(0, 1, 0, 1));
140 }
142  return shuffle2<false>(b, a, eigen_neon_shuffle_mask(2, 3, 2, 3));
143 }
145  return shuffle2<true>(a, b, eigen_neon_shuffle_mask(0, 0, 1, 1));
146 }
148  return shuffle2<true>(a, b, eigen_neon_shuffle_mask(2, 2, 3, 3));
149 }
// Expands to a Packet4f built from vdupq_lane_f32 applied to lane `p` of the
// low half of `a` (vget_low_f32).
// NOTE(review): since only the low half is passed on, p is presumably limited
// to 0 or 1 and must be a compile-time constant -- confirm against callers.
150 #define vec4f_duplane(a, p) Packet4f(vdupq_lane_f32(vget_low_f32(a), p))
151 
// Declares a constant named p4f_<NAME> of type Packet4f, initialized with
// pset1<Packet4f>(X). The expansion has no trailing semicolon; the user of
// the macro supplies it.
152 #define EIGEN_DECLARE_CONST_Packet4f(NAME, X) const Packet4f p4f_##NAME = pset1<Packet4f>(X)
153 
// Declares a constant named p4f_<NAME> of type Packet4f whose four lanes all
// carry the 32-bit pattern X, reinterpreted (not converted) as float.
// vreinterpretq_f32_u32 requires a uint32x4_t operand, so the pattern is
// broadcast with vdupq_n_u32 first. The earlier pset1<int32_t>(X) named a
// scalar type rather than a packet, which made the macro unusable.
// The expansion has no trailing semicolon; the user of the macro supplies it.
#define EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME, X) \
  const Packet4f p4f_##NAME = vreinterpretq_f32_u32(vdupq_n_u32(X))
156 
// Declares a constant named p4i_<NAME> of type Packet4i, initialized with
// pset1<Packet4i>(X). The expansion has no trailing semicolon; the user of
// the macro supplies it.
157 #define EIGEN_DECLARE_CONST_Packet4i(NAME, X) const Packet4i p4i_##NAME = pset1<Packet4i>(X)
158 
// Selects the implementation of EIGEN_ARM_PREFETCH(ADDR). The first matching
// branch wins: inline "prfm" asm on ARM64 with a GNU-compatible compiler,
// then __builtin_prefetch, then __pld, then inline "pld" asm on 32-bit ARM,
// and finally an empty expansion.
// NOTE(review): the asm and __builtin_prefetch variants end in ';' while the
// __pld and empty variants do not, so the macro expands to a different number
// of statements depending on the branch -- confirm call sites tolerate both.
159 #if EIGEN_ARCH_ARM64 && EIGEN_COMP_GNUC
160 // __builtin_prefetch tends to do nothing on ARM64 compilers because the
161 // prefetch instructions there are too detailed for __builtin_prefetch to map
162 // meaningfully to them.
163 #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__("prfm pldl1keep, [%[addr]]\n" ::[addr] "r"(ADDR) :);
164 #elif EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
165 #define EIGEN_ARM_PREFETCH(ADDR) __builtin_prefetch(ADDR);
166 #elif defined __pld
167 #define EIGEN_ARM_PREFETCH(ADDR) __pld(ADDR)
168 #elif EIGEN_ARCH_ARM
169 #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__("pld [%[addr]]\n" ::[addr] "r"(ADDR) :);
170 #else
171 // by default no explicit prefetching
172 #define EIGEN_ARM_PREFETCH(ADDR)
173 #endif
174 
// packet_traits specialization for float: the full packet is Packet4f
// (size = 4) and the half packet is Packet2f. The enum overrides values
// inherited from default_packet_traits; the Has* entries are presumably read
// by generic code as per-operation capability flags (1 = on, 0 = off).
175 template <>
176 struct packet_traits<float> : default_packet_traits {
177  typedef Packet4f type;
178  typedef Packet2f half;
179  enum {
180  Vectorizable = 1,
181  AlignedOnScalar = 1,
182  size = 4,
183 
184  HasCmp = 1,
185  HasAdd = 1,
186  HasSub = 1,
187  HasShift = 1,
188  HasMul = 1,
189  HasNegate = 1,
190  HasAbs = 1,
191  HasArg = 0,
192  HasAbs2 = 1,
194  HasMin = 1,
195  HasMax = 1,
196  HasConj = 1,
197  HasSetLinear = 1,
198  HasBlend = 0,
199  HasDiv = 1,
202  HasACos = 1,
203  HasASin = 1,
204  HasATan = 1,
205  HasATanh = 1,
206  HasLog = 1,
207  HasExp = 1,
208  HasSqrt = 1,
209  HasRsqrt = 1,
213  HasBessel = 0, // Issues with accuracy.
214  HasNdtri = 0
215  };
216 };
217 
// packet_traits specialization for int8_t: the full packet is Packet16c
// (size = 16) and the half packet is Packet8c. The enum overrides values
// inherited from default_packet_traits; HasArg and HasBlend are switched off.
218 template <>
219 struct packet_traits<int8_t> : default_packet_traits {
220  typedef Packet16c type;
221  typedef Packet8c half;
222  enum {
223  Vectorizable = 1,
224  AlignedOnScalar = 1,
225  size = 16,
226 
227  HasCmp = 1,
228  HasAdd = 1,
229  HasSub = 1,
230  HasShift = 1,
231  HasMul = 1,
233  HasAbs = 1,
235  HasArg = 0,
236  HasAbs2 = 1,
237  HasMin = 1,
238  HasMax = 1,
239  HasConj = 1,
240  HasSetLinear = 1,
241  HasBlend = 0
242  };
243 };
244 
// packet_traits specialization for uint8_t: the full packet is Packet16uc
// (size = 16) and the half packet is Packet8uc. Unlike the signed variant,
// HasNegate is explicitly 0 and HasSqrt is set to 1.
245 template <>
246 struct packet_traits<uint8_t> : default_packet_traits {
247  typedef Packet16uc type;
248  typedef Packet8uc half;
249  enum {
250  Vectorizable = 1,
251  AlignedOnScalar = 1,
252  size = 16,
253 
254  HasCmp = 1,
255  HasAdd = 1,
256  HasSub = 1,
257  HasShift = 1,
258  HasMul = 1,
259  HasNegate = 0,
260  HasAbs = 1,
262  HasArg = 0,
263  HasAbs2 = 1,
264  HasMin = 1,
265  HasMax = 1,
266  HasConj = 1,
267  HasSetLinear = 1,
268  HasBlend = 0,
269 
270  HasSqrt = 1
271  };
272 };
273 
// packet_traits specialization for int16_t: the full packet is Packet8s
// (size = 8) and the half packet is Packet4s. The enum overrides values
// inherited from default_packet_traits; HasArg and HasBlend are switched off.
274 template <>
275 struct packet_traits<int16_t> : default_packet_traits {
276  typedef Packet8s type;
277  typedef Packet4s half;
278  enum {
279  Vectorizable = 1,
280  AlignedOnScalar = 1,
281  size = 8,
282 
283  HasCmp = 1,
284  HasAdd = 1,
285  HasSub = 1,
286  HasShift = 1,
287  HasMul = 1,
289  HasAbs = 1,
291  HasArg = 0,
292  HasAbs2 = 1,
293  HasMin = 1,
294  HasMax = 1,
295  HasConj = 1,
296  HasSetLinear = 1,
297  HasBlend = 0
298  };
299 };
300 
// packet_traits specialization for uint16_t: the full packet is Packet8us
// (size = 8) and the half packet is Packet4us. Unlike the signed variant,
// HasNegate is explicitly 0 and HasSqrt is set to 1.
301 template <>
302 struct packet_traits<uint16_t> : default_packet_traits {
303  typedef Packet8us type;
304  typedef Packet4us half;
305  enum {
306  Vectorizable = 1,
307  AlignedOnScalar = 1,
308  size = 8,
309 
310  HasCmp = 1,
311  HasAdd = 1,
312  HasSub = 1,
313  HasShift = 1,
314  HasMul = 1,
315  HasNegate = 0,
316  HasAbs = 1,
318  HasArg = 0,
319  HasAbs2 = 1,
320  HasMin = 1,
321  HasMax = 1,
322  HasConj = 1,
323  HasSetLinear = 1,
324  HasBlend = 0,
325  HasSqrt = 1
326  };
327 };
328 
// packet_traits specialization for int32_t: the full packet is Packet4i
// (size = 4) and the half packet is Packet2i. The enum overrides values
// inherited from default_packet_traits; HasArg and HasBlend are switched off.
329 template <>
330 struct packet_traits<int32_t> : default_packet_traits {
331  typedef Packet4i type;
332  typedef Packet2i half;
333  enum {
334  Vectorizable = 1,
335  AlignedOnScalar = 1,
336  size = 4,
337 
338  HasCmp = 1,
339  HasAdd = 1,
340  HasSub = 1,
341  HasShift = 1,
342  HasMul = 1,
344  HasAbs = 1,
345  HasArg = 0,
346  HasAbs2 = 1,
348  HasMin = 1,
349  HasMax = 1,
350  HasConj = 1,
351  HasSetLinear = 1,
352  HasBlend = 0
353  };
354 };
355 
// packet_traits specialization for uint32_t: the full packet is Packet4ui
// (size = 4) and the half packet is Packet2ui. Unlike the signed variant,
// HasNegate is explicitly 0 and HasSqrt is set to 1.
356 template <>
357 struct packet_traits<uint32_t> : default_packet_traits {
358  typedef Packet4ui type;
359  typedef Packet2ui half;
360  enum {
361  Vectorizable = 1,
362  AlignedOnScalar = 1,
363  size = 4,
364 
365  HasCmp = 1,
366  HasAdd = 1,
367  HasSub = 1,
368  HasShift = 1,
369  HasMul = 1,
370  HasNegate = 0,
371  HasAbs = 1,
372  HasArg = 0,
373  HasAbs2 = 1,
375  HasMin = 1,
376  HasMax = 1,
377  HasConj = 1,
378  HasSetLinear = 1,
379  HasBlend = 0,
380 
381  HasSqrt = 1
382  };
383 };
384 
// packet_traits specialization for int64_t: the full packet is Packet2l
// (size = 2). `half` is also Packet2l, i.e. no narrower packet type is
// declared for this scalar.
385 template <>
386 struct packet_traits<int64_t> : default_packet_traits {
387  typedef Packet2l type;
388  typedef Packet2l half;
389  enum {
390  Vectorizable = 1,
391  AlignedOnScalar = 1,
392  size = 2,
393 
394  HasCmp = 1,
395  HasAdd = 1,
396  HasSub = 1,
397  HasShift = 1,
398  HasMul = 1,
400  HasAbs = 1,
401  HasArg = 0,
402  HasAbs2 = 1,
404  HasMin = 1,
405  HasMax = 1,
406  HasConj = 1,
407  HasSetLinear = 1,
408  HasBlend = 0
409  };
410 };
411 
// packet_traits specialization for uint64_t: the full packet is Packet2ul
// (size = 2). `half` is also Packet2ul, i.e. no narrower packet type is
// declared for this scalar. HasNegate is explicitly 0.
412 template <>
413 struct packet_traits<uint64_t> : default_packet_traits {
414  typedef Packet2ul type;
415  typedef Packet2ul half;
416  enum {
417  Vectorizable = 1,
418  AlignedOnScalar = 1,
419  size = 2,
420 
421  HasCmp = 1,
422  HasAdd = 1,
423  HasSub = 1,
424  HasShift = 1,
425  HasMul = 1,
426  HasNegate = 0,
427  HasAbs = 1,
428  HasArg = 0,
429  HasAbs2 = 1,
431  HasMin = 1,
432  HasMax = 1,
433  HasConj = 1,
434  HasSetLinear = 1,
435  HasBlend = 0
436  };
437 };
438 
439 template <>
441  typedef float type;
442  typedef Packet2f half;
444  enum {
445  size = 2,
447  vectorizable = true,
449  masked_store_available = false
450  };
451 };
// Reverse mapping for Packet4f: scalar type float, half packet Packet2f,
// 4 lanes. Masked loads and stores are reported as unavailable.
452 template <>
453 struct unpacket_traits<Packet4f> {
454  typedef float type;
455  typedef Packet2f half;
457  enum {
458  size = 4,
460  vectorizable = true,
461  masked_load_available = false,
462  masked_store_available = false
463  };
464 };
465 template <>
467  typedef int8_t type;
468  typedef Packet4c half;
469  enum {
470  size = 4,
472  vectorizable = true,
474  masked_store_available = false
475  };
476 };
477 template <>
479  typedef int8_t type;
480  typedef Packet4c half;
481  enum {
482  size = 8,
484  vectorizable = true,
486  masked_store_available = false
487  };
488 };
// Reverse mapping for Packet16c: scalar type int8_t, half packet Packet8c,
// 16 lanes. Masked loads and stores are reported as unavailable.
489 template <>
490 struct unpacket_traits<Packet16c> {
491  typedef int8_t type;
492  typedef Packet8c half;
493  enum {
494  size = 16,
496  vectorizable = true,
497  masked_load_available = false,
498  masked_store_available = false
499  };
500 };
501 template <>
503  typedef uint8_t type;
504  typedef Packet4uc half;
505  enum {
506  size = 4,
508  vectorizable = true,
510  masked_store_available = false
511  };
512 };
513 template <>
515  typedef uint8_t type;
516  typedef Packet4uc half;
517  enum {
518  size = 8,
520  vectorizable = true,
522  masked_store_available = false
523  };
524 };
// Reverse mapping for Packet16uc: scalar type uint8_t, half packet Packet8uc,
// 16 lanes. Masked loads and stores are reported as unavailable.
525 template <>
526 struct unpacket_traits<Packet16uc> {
527  typedef uint8_t type;
528  typedef Packet8uc half;
529  enum {
530  size = 16,
532  vectorizable = true,
533  masked_load_available = false,
534  masked_store_available = false
535  };
536 };
537 template <>
539  typedef int16_t type;
540  typedef Packet4s half;
541  enum {
542  size = 4,
544  vectorizable = true,
546  masked_store_available = false
547  };
548 };
// Reverse mapping for Packet8s: scalar type int16_t, half packet Packet4s,
// 8 lanes. Masked loads and stores are reported as unavailable.
549 template <>
550 struct unpacket_traits<Packet8s> {
551  typedef int16_t type;
552  typedef Packet4s half;
553  enum {
554  size = 8,
556  vectorizable = true,
557  masked_load_available = false,
558  masked_store_available = false
559  };
560 };
561 template <>
563  typedef uint16_t type;
564  typedef Packet4us half;
565  enum {
566  size = 4,
568  vectorizable = true,
570  masked_store_available = false
571  };
572 };
// Reverse mapping for Packet8us: scalar type uint16_t, half packet Packet4us,
// 8 lanes. Masked loads and stores are reported as unavailable.
573 template <>
574 struct unpacket_traits<Packet8us> {
575  typedef uint16_t type;
576  typedef Packet4us half;
577  enum {
578  size = 8,
580  vectorizable = true,
581  masked_load_available = false,
582  masked_store_available = false
583  };
584 };
585 template <>
587  typedef int32_t type;
588  typedef Packet2i half;
589  enum {
590  size = 2,
592  vectorizable = true,
594  masked_store_available = false
595  };
596 };
// Reverse mapping for Packet4i: scalar type int32_t, half packet Packet2i,
// 4 lanes. Masked loads and stores are reported as unavailable.
597 template <>
598 struct unpacket_traits<Packet4i> {
599  typedef int32_t type;
600  typedef Packet2i half;
601  enum {
602  size = 4,
604  vectorizable = true,
605  masked_load_available = false,
606  masked_store_available = false
607  };
608 };
609 template <>
611  typedef uint32_t type;
612  typedef Packet2ui half;
613  enum {
614  size = 2,
616  vectorizable = true,
618  masked_store_available = false
619  };
620 };
// Reverse mapping for Packet4ui: scalar type uint32_t, half packet Packet2ui,
// 4 lanes. Masked loads and stores are reported as unavailable.
621 template <>
622 struct unpacket_traits<Packet4ui> {
623  typedef uint32_t type;
624  typedef Packet2ui half;
625  enum {
626  size = 4,
628  vectorizable = true,
629  masked_load_available = false,
630  masked_store_available = false
631  };
632 };
// Reverse mapping for Packet2l: scalar type int64_t, 2 lanes. `half` is
// Packet2l itself. Masked loads and stores are reported as unavailable.
633 template <>
634 struct unpacket_traits<Packet2l> {
635  typedef int64_t type;
636  typedef Packet2l half;
637  enum {
638  size = 2,
640  vectorizable = true,
641  masked_load_available = false,
642  masked_store_available = false
643  };
644 };
// Reverse mapping for Packet2ul: scalar type uint64_t, 2 lanes. `half` is
// Packet2ul itself. Masked loads and stores are reported as unavailable.
645 template <>
646 struct unpacket_traits<Packet2ul> {
647  typedef uint64_t type;
648  typedef Packet2ul half;
649  enum {
650  size = 2,
652  vectorizable = true,
653  masked_load_available = false,
654  masked_store_available = false
655  };
656 };
657 
658 template <>
660  return vdup_n_f32(0.0f);
661 }
662 
663 template <>
665  return vdupq_n_f32(0.0f);
666 }
667 
668 template <>
670  return vdup_n_f32(from);
671 }
672 template <>
673 EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
674  return vdupq_n_f32(from);
675 }
676 template <>
678  return vget_lane_s32(vreinterpret_s32_s8(vdup_n_s8(from)), 0);
679 }
680 template <>
682  return vdup_n_s8(from);
683 }
684 template <>
686  return vdupq_n_s8(from);
687 }
688 template <>
690  return vget_lane_u32(vreinterpret_u32_u8(vdup_n_u8(from)), 0);
691 }
692 template <>
694  return vdup_n_u8(from);
695 }
696 template <>
698  return vdupq_n_u8(from);
699 }
700 template <>
702  return vdup_n_s16(from);
703 }
704 template <>
706  return vdupq_n_s16(from);
707 }
708 template <>
710  return vdup_n_u16(from);
711 }
712 template <>
714  return vdupq_n_u16(from);
715 }
716 template <>
718  return vdup_n_s32(from);
719 }
720 template <>
722  return vdupq_n_s32(from);
723 }
724 template <>
726  return vdup_n_u32(from);
727 }
728 template <>
730  return vdupq_n_u32(from);
731 }
732 template <>
734  return vdupq_n_s64(from);
735 }
736 template <>
738  return vdupq_n_u64(from);
739 }
740 
741 template <>
743  return vreinterpret_f32_u32(vdup_n_u32(from));
744 }
745 template <>
747  return vreinterpretq_f32_u32(vdupq_n_u32(from));
748 }
749 
750 template <>
752  const float c[] = {0.0f, 1.0f};
753  return vadd_f32(pset1<Packet2f>(a), vld1_f32(c));
754 }
755 template <>
757  const float c[] = {0.0f, 1.0f, 2.0f, 3.0f};
758  return vaddq_f32(pset1<Packet4f>(a), vld1q_f32(c));
759 }
760 template <>
762  return vget_lane_s32(vreinterpret_s32_s8(vadd_s8(vreinterpret_s8_u32(vdup_n_u32(0x03020100)), vdup_n_s8(a))), 0);
763 }
764 template <>
766  const int8_t c[] = {0, 1, 2, 3, 4, 5, 6, 7};
767  return vadd_s8(pset1<Packet8c>(a), vld1_s8(c));
768 }
769 template <>
771  const int8_t c[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
772  return vaddq_s8(pset1<Packet16c>(a), vld1q_s8(c));
773 }
774 template <>
776  return vget_lane_u32(vreinterpret_u32_u8(vadd_u8(vreinterpret_u8_u32(vdup_n_u32(0x03020100)), vdup_n_u8(a))), 0);
777 }
778 template <>
780  const uint8_t c[] = {0, 1, 2, 3, 4, 5, 6, 7};
781  return vadd_u8(pset1<Packet8uc>(a), vld1_u8(c));
782 }
783 template <>
785  const uint8_t c[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
786  return vaddq_u8(pset1<Packet16uc>(a), vld1q_u8(c));
787 }
788 template <>
790  const int16_t c[] = {0, 1, 2, 3};
791  return vadd_s16(pset1<Packet4s>(a), vld1_s16(c));
792 }
793 template <>
795  const uint16_t c[] = {0, 1, 2, 3};
796  return vadd_u16(pset1<Packet4us>(a), vld1_u16(c));
797 }
798 template <>
800  const int16_t c[] = {0, 1, 2, 3, 4, 5, 6, 7};
801  return vaddq_s16(pset1<Packet8s>(a), vld1q_s16(c));
802 }
803 template <>
805  const uint16_t c[] = {0, 1, 2, 3, 4, 5, 6, 7};
806  return vaddq_u16(pset1<Packet8us>(a), vld1q_u16(c));
807 }
808 template <>
810  const int32_t c[] = {0, 1};
811  return vadd_s32(pset1<Packet2i>(a), vld1_s32(c));
812 }
813 template <>
815  const int32_t c[] = {0, 1, 2, 3};
816  return vaddq_s32(pset1<Packet4i>(a), vld1q_s32(c));
817 }
818 template <>
820  const uint32_t c[] = {0, 1};
821  return vadd_u32(pset1<Packet2ui>(a), vld1_u32(c));
822 }
823 template <>
825  const uint32_t c[] = {0, 1, 2, 3};
826  return vaddq_u32(pset1<Packet4ui>(a), vld1q_u32(c));
827 }
828 template <>
830  const int64_t c[] = {0, 1};
831  return vaddq_s64(pset1<Packet2l>(a), vld1q_s64(c));
832 }
833 template <>
835  const uint64_t c[] = {0, 1};
836  return vaddq_u64(pset1<Packet2ul>(a), vld1q_u64(c));
837 }
838 
839 template <>
841  return vadd_f32(a, b);
842 }
843 template <>
845  return vaddq_f32(a, b);
846 }
847 template <>
849  return vget_lane_s32(
850  vreinterpret_s32_s8(vadd_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
851 }
852 template <>
854  return vadd_s8(a, b);
855 }
856 template <>
858  return vaddq_s8(a, b);
859 }
860 template <>
862  return vget_lane_u32(
863  vreinterpret_u32_u8(vadd_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
864 }
865 template <>
867  return vadd_u8(a, b);
868 }
869 template <>
871  return vaddq_u8(a, b);
872 }
873 template <>
875  return vadd_s16(a, b);
876 }
877 template <>
879  return vaddq_s16(a, b);
880 }
881 template <>
883  return vadd_u16(a, b);
884 }
885 template <>
887  return vaddq_u16(a, b);
888 }
889 template <>
891  return vadd_s32(a, b);
892 }
893 template <>
895  return vaddq_s32(a, b);
896 }
897 template <>
899  return vadd_u32(a, b);
900 }
901 template <>
903  return vaddq_u32(a, b);
904 }
905 template <>
907  return vaddq_s64(a, b);
908 }
909 template <>
911  return vaddq_u64(a, b);
912 }
913 
914 template <>
916  return vsub_f32(a, b);
917 }
918 template <>
920  return vsubq_f32(a, b);
921 }
922 template <>
924  return vget_lane_s32(
925  vreinterpret_s32_s8(vsub_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
926 }
927 template <>
929  return vsub_s8(a, b);
930 }
931 template <>
933  return vsubq_s8(a, b);
934 }
935 template <>
937  return vget_lane_u32(
938  vreinterpret_u32_u8(vsub_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
939 }
940 template <>
942  return vsub_u8(a, b);
943 }
944 template <>
946  return vsubq_u8(a, b);
947 }
948 template <>
950  return vsub_s16(a, b);
951 }
952 template <>
954  return vsubq_s16(a, b);
955 }
956 template <>
958  return vsub_u16(a, b);
959 }
960 template <>
962  return vsubq_u16(a, b);
963 }
964 template <>
966  return vsub_s32(a, b);
967 }
968 template <>
970  return vsubq_s32(a, b);
971 }
972 template <>
974  return vsub_u32(a, b);
975 }
976 template <>
978  return vsubq_u32(a, b);
979 }
980 template <>
982  return vsubq_s64(a, b);
983 }
984 template <>
986  return vsubq_u64(a, b);
987 }
988 
989 template <>
991 template <>
993  Packet2f mask = make_packet2f(numext::bit_cast<float>(0x80000000u), 0.0f);
994  return padd(a, pxor(mask, b));
995 }
996 template <>
998 template <>
1000  Packet4f mask = make_packet4f(numext::bit_cast<float>(0x80000000u), 0.0f, numext::bit_cast<float>(0x80000000u), 0.0f);
1001  return padd(a, pxor(mask, b));
1002 }
1003 
1004 template <>
1006  return vneg_f32(a);
1007 }
1008 template <>
1010  return vnegq_f32(a);
1011 }
1012 template <>
1014  return vget_lane_s32(vreinterpret_s32_s8(vneg_s8(vreinterpret_s8_s32(vdup_n_s32(a)))), 0);
1015 }
1016 template <>
1018  return vneg_s8(a);
1019 }
1020 template <>
1022  return vnegq_s8(a);
1023 }
1024 template <>
1026  return vneg_s16(a);
1027 }
1028 template <>
1030  return vnegq_s16(a);
1031 }
1032 template <>
1034  return vneg_s32(a);
1035 }
1036 template <>
1038  return vnegq_s32(a);
1039 }
1040 template <>
1042 #if EIGEN_ARCH_ARM64
1043  return vnegq_s64(a);
1044 #else
1045  return vcombine_s64(vdup_n_s64(-vgetq_lane_s64(a, 0)), vdup_n_s64(-vgetq_lane_s64(a, 1)));
1046 #endif
1047 }
1048 
1049 template <>
1051  return a;
1052 }
1053 template <>
1055  return a;
1056 }
1057 template <>
1059  return a;
1060 }
1061 template <>
1063  return a;
1064 }
1065 template <>
1067  return a;
1068 }
1069 template <>
1071  return a;
1072 }
1073 template <>
1075  return a;
1076 }
1077 template <>
1079  return a;
1080 }
1081 template <>
1083  return a;
1084 }
1085 template <>
1087  return a;
1088 }
1089 template <>
1091  return a;
1092 }
1093 template <>
1095  return a;
1096 }
1097 template <>
1099  return a;
1100 }
1101 template <>
1103  return a;
1104 }
1105 template <>
1107  return a;
1108 }
1109 template <>
1111  return a;
1112 }
1113 template <>
1115  return a;
1116 }
1117 template <>
1119  return a;
1120 }
1121 
1122 template <>
1124  return vmul_f32(a, b);
1125 }
1126 template <>
1128  return vmulq_f32(a, b);
1129 }
1130 template <>
1132  return vget_lane_s32(
1133  vreinterpret_s32_s8(vmul_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
1134 }
1135 template <>
1137  return vmul_s8(a, b);
1138 }
1139 template <>
1141  return vmulq_s8(a, b);
1142 }
1143 template <>
1145  return vget_lane_u32(
1146  vreinterpret_u32_u8(vmul_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
1147 }
1148 template <>
1150  return vmul_u8(a, b);
1151 }
1152 template <>
1154  return vmulq_u8(a, b);
1155 }
1156 template <>
1158  return vmul_s16(a, b);
1159 }
1160 template <>
1162  return vmulq_s16(a, b);
1163 }
1164 template <>
1166  return vmul_u16(a, b);
1167 }
1168 template <>
1170  return vmulq_u16(a, b);
1171 }
1172 template <>
1174  return vmul_s32(a, b);
1175 }
1176 template <>
1178  return vmulq_s32(a, b);
1179 }
1180 template <>
1182  return vmul_u32(a, b);
1183 }
1184 template <>
1186  return vmulq_u32(a, b);
1187 }
1188 template <>
1190  return vcombine_s64(vdup_n_s64(vgetq_lane_s64(a, 0) * vgetq_lane_s64(b, 0)),
1191  vdup_n_s64(vgetq_lane_s64(a, 1) * vgetq_lane_s64(b, 1)));
1192 }
1193 template <>
1195  return vcombine_u64(vdup_n_u64(vgetq_lane_u64(a, 0) * vgetq_lane_u64(b, 0)),
1196  vdup_n_u64(vgetq_lane_u64(a, 1) * vgetq_lane_u64(b, 1)));
1197 }
1198 
1199 template <>
1201  eigen_assert(false && "packet integer division are not supported by NEON");
1202  return pset1<Packet4c>(0);
1203 }
1204 template <>
1206  eigen_assert(false && "packet integer division are not supported by NEON");
1207  return pset1<Packet8c>(0);
1208 }
1209 template <>
1211  eigen_assert(false && "packet integer division are not supported by NEON");
1212  return pset1<Packet16c>(0);
1213 }
1214 template <>
1216  eigen_assert(false && "packet integer division are not supported by NEON");
1217  return pset1<Packet4uc>(0);
1218 }
1219 template <>
1221  eigen_assert(false && "packet integer division are not supported by NEON");
1222  return pset1<Packet8uc>(0);
1223 }
1224 template <>
1226  eigen_assert(false && "packet integer division are not supported by NEON");
1227  return pset1<Packet16uc>(0);
1228 }
1229 template <>
1231  eigen_assert(false && "packet integer division are not supported by NEON");
1232  return pset1<Packet4s>(0);
1233 }
1234 template <>
1235 EIGEN_STRONG_INLINE Packet8s pdiv<Packet8s>(const Packet8s& /*a*/, const Packet8s& /*b*/) {
1236  eigen_assert(false && "packet integer division are not supported by NEON");
1237  return pset1<Packet8s>(0);
1238 }
1239 template <>
1241  eigen_assert(false && "packet integer division are not supported by NEON");
1242  return pset1<Packet4us>(0);
1243 }
1244 template <>
1245 EIGEN_STRONG_INLINE Packet8us pdiv<Packet8us>(const Packet8us& /*a*/, const Packet8us& /*b*/) {
1246  eigen_assert(false && "packet integer division are not supported by NEON");
1247  return pset1<Packet8us>(0);
1248 }
1249 template <>
1251  eigen_assert(false && "packet integer division are not supported by NEON");
1252  return pset1<Packet2i>(0);
1253 }
1254 template <>
1255 EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/) {
1256  eigen_assert(false && "packet integer division are not supported by NEON");
1257  return pset1<Packet4i>(0);
1258 }
1259 template <>
1261  eigen_assert(false && "packet integer division are not supported by NEON");
1262  return pset1<Packet2ui>(0);
1263 }
1264 template <>
1265 EIGEN_STRONG_INLINE Packet4ui pdiv<Packet4ui>(const Packet4ui& /*a*/, const Packet4ui& /*b*/) {
1266  eigen_assert(false && "packet integer division are not supported by NEON");
1267  return pset1<Packet4ui>(0);
1268 }
1269 template <>
1270 EIGEN_STRONG_INLINE Packet2l pdiv<Packet2l>(const Packet2l& /*a*/, const Packet2l& /*b*/) {
1271  eigen_assert(false && "packet integer division are not supported by NEON");
1272  return pset1<Packet2l>(0LL);
1273 }
1274 template <>
1275 EIGEN_STRONG_INLINE Packet2ul pdiv<Packet2ul>(const Packet2ul& /*a*/, const Packet2ul& /*b*/) {
1276  eigen_assert(false && "packet integer division are not supported by NEON");
1277  return pset1<Packet2ul>(0ULL);
1278 }
1279 
1280 #ifdef EIGEN_VECTORIZE_FMA
1281 template <>
1282 EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
1283  return vfmaq_f32(c, a, b);
1284 }
1285 template <>
1286 EIGEN_STRONG_INLINE Packet2f pmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c) {
1287  return vfma_f32(c, a, b);
1288 }
1289 #else
1290 template <>
1291 EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
1292  return vmlaq_f32(c, a, b);
1293 }
1294 template <>
1296  return vmla_f32(c, a, b);
1297 }
1298 #endif
1299 
1300 // No FMA instruction for int, so use MLA unconditionally.
1301 template <>
1303  return vget_lane_s32(
1304  vreinterpret_s32_s8(vmla_s8(vreinterpret_s8_s32(vdup_n_s32(c)), vreinterpret_s8_s32(vdup_n_s32(a)),
1305  vreinterpret_s8_s32(vdup_n_s32(b)))),
1306  0);
1307 }
1308 template <>
1310  return vmla_s8(c, a, b);
1311 }
1312 template <>
1314  return vmlaq_s8(c, a, b);
1315 }
1316 template <>
1318  return vget_lane_u32(
1319  vreinterpret_u32_u8(vmla_u8(vreinterpret_u8_u32(vdup_n_u32(c)), vreinterpret_u8_u32(vdup_n_u32(a)),
1320  vreinterpret_u8_u32(vdup_n_u32(b)))),
1321  0);
1322 }
1323 template <>
1325  return vmla_u8(c, a, b);
1326 }
1327 template <>
1329  return vmlaq_u8(c, a, b);
1330 }
1331 template <>
1333  return vmla_s16(c, a, b);
1334 }
1335 template <>
1336 EIGEN_STRONG_INLINE Packet8s pmadd(const Packet8s& a, const Packet8s& b, const Packet8s& c) {
1337  return vmlaq_s16(c, a, b);
1338 }
1339 template <>
1341  return vmla_u16(c, a, b);
1342 }
1343 template <>
1345  return vmlaq_u16(c, a, b);
1346 }
1347 template <>
1349  return vmla_s32(c, a, b);
1350 }
1351 template <>
1352 EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) {
1353  return vmlaq_s32(c, a, b);
1354 }
1355 template <>
1357  return vmla_u32(c, a, b);
1358 }
1359 template <>
1361  return vmlaq_u32(c, a, b);
1362 }
1363 
1364 template <>
1366  return vabd_f32(a, b);
1367 }
1368 template <>
1370  return vabdq_f32(a, b);
1371 }
1372 template <>
1374  return vget_lane_s32(
1375  vreinterpret_s32_s8(vabd_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
1376 }
1377 template <>
1379  return vabd_s8(a, b);
1380 }
1381 template <>
1383  return vabdq_s8(a, b);
1384 }
1385 template <>
1387  return vget_lane_u32(
1388  vreinterpret_u32_u8(vabd_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
1389 }
1390 template <>
1392  return vabd_u8(a, b);
1393 }
1394 template <>
1396  return vabdq_u8(a, b);
1397 }
1398 template <>
1400  return vabd_s16(a, b);
1401 }
1402 template <>
1404  return vabdq_s16(a, b);
1405 }
1406 template <>
1408  return vabd_u16(a, b);
1409 }
1410 template <>
1412  return vabdq_u16(a, b);
1413 }
1414 template <>
1416  return vabd_s32(a, b);
1417 }
1418 template <>
1420  return vabdq_s32(a, b);
1421 }
1422 template <>
1424  return vabd_u32(a, b);
1425 }
1426 template <>
1428  return vabdq_u32(a, b);
1429 }
1430 
1431 template <>
1433  return vmin_f32(a, b);
1434 }
1435 template <>
1437  return vminq_f32(a, b);
1438 }
1439 
1440 #ifdef __ARM_FEATURE_NUMERIC_MAXMIN
1441 // numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8
1442 // systems).
1443 template <>
1445  return vminnmq_f32(a, b);
1446 }
1447 template <>
1448 EIGEN_STRONG_INLINE Packet2f pmin<PropagateNumbers, Packet2f>(const Packet2f& a, const Packet2f& b) {
1449  return vminnm_f32(a, b);
1450 }
1451 #endif
1452 
1453 template <>
1455  return pmin<Packet4f>(a, b);
1456 }
1457 
1458 template <>
1460  return pmin<Packet2f>(a, b);
1461 }
1462 
1463 template <>
1465  return vget_lane_s32(
1466  vreinterpret_s32_s8(vmin_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
1467 }
1468 template <>
1470  return vmin_s8(a, b);
1471 }
1472 template <>
1474  return vminq_s8(a, b);
1475 }
1476 template <>
1478  return vget_lane_u32(
1479  vreinterpret_u32_u8(vmin_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
1480 }
1481 template <>
1483  return vmin_u8(a, b);
1484 }
1485 template <>
1487  return vminq_u8(a, b);
1488 }
1489 template <>
1491  return vmin_s16(a, b);
1492 }
1493 template <>
1495  return vminq_s16(a, b);
1496 }
1497 template <>
1499  return vmin_u16(a, b);
1500 }
1501 template <>
1503  return vminq_u16(a, b);
1504 }
1505 template <>
1507  return vmin_s32(a, b);
1508 }
1509 template <>
1511  return vminq_s32(a, b);
1512 }
1513 template <>
1515  return vmin_u32(a, b);
1516 }
1517 template <>
1519  return vminq_u32(a, b);
1520 }
1521 template <>
1523  return vcombine_s64(vdup_n_s64((std::min)(vgetq_lane_s64(a, 0), vgetq_lane_s64(b, 0))),
1524  vdup_n_s64((std::min)(vgetq_lane_s64(a, 1), vgetq_lane_s64(b, 1))));
1525 }
1526 template <>
1528  return vcombine_u64(vdup_n_u64((std::min)(vgetq_lane_u64(a, 0), vgetq_lane_u64(b, 0))),
1529  vdup_n_u64((std::min)(vgetq_lane_u64(a, 1), vgetq_lane_u64(b, 1))));
1530 }
1531 
1532 template <>
1534  return vmax_f32(a, b);
1535 }
1536 template <>
1538  return vmaxq_f32(a, b);
1539 }
1540 
1541 #ifdef __ARM_FEATURE_NUMERIC_MAXMIN
1542 // Numeric max and min are only available if __ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for
1543 // Armv8 systems).
1544 template <>
1546  return vmaxnmq_f32(a, b);
1547 }
// Lane-wise maximum of two 2-lane float packets, computed with vmaxnm_f32.
// vmaxnm implements IEEE 754-2008 maxNum: if exactly one operand lane is a quiet NaN, the numeric operand is
// returned for that lane. Only compiled when __ARM_FEATURE_NUMERIC_MAXMIN is defined (enclosing #ifdef).
1548 template <>
1549 EIGEN_STRONG_INLINE Packet2f pmax<PropagateNumbers, Packet2f>(const Packet2f& a, const Packet2f& b) {
1550  return vmaxnm_f32(a, b);
1551 }
1552 #endif
1553 
1554 template <>
1556  return pmax<Packet4f>(a, b);
1557 }
1558 
1559 template <>
1561  return pmax<Packet2f>(a, b);
1562 }
1563 
1564 template <>
1566  return vget_lane_s32(
1567  vreinterpret_s32_s8(vmax_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
1568 }
1569 template <>
1571  return vmax_s8(a, b);
1572 }
1573 template <>
1575  return vmaxq_s8(a, b);
1576 }
1577 template <>
1579  return vget_lane_u32(
1580  vreinterpret_u32_u8(vmax_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
1581 }
1582 template <>
1584  return vmax_u8(a, b);
1585 }
1586 template <>
1588  return vmaxq_u8(a, b);
1589 }
1590 template <>
1592  return vmax_s16(a, b);
1593 }
1594 template <>
1596  return vmaxq_s16(a, b);
1597 }
1598 template <>
1600  return vmax_u16(a, b);
1601 }
1602 template <>
1604  return vmaxq_u16(a, b);
1605 }
1606 template <>
1608  return vmax_s32(a, b);
1609 }
1610 template <>
1612  return vmaxq_s32(a, b);
1613 }
1614 template <>
1616  return vmax_u32(a, b);
1617 }
1618 template <>
1620  return vmaxq_u32(a, b);
1621 }
1622 template <>
1624  return vcombine_s64(vdup_n_s64((std::max)(vgetq_lane_s64(a, 0), vgetq_lane_s64(b, 0))),
1625  vdup_n_s64((std::max)(vgetq_lane_s64(a, 1), vgetq_lane_s64(b, 1))));
1626 }
1627 template <>
1629  return vcombine_u64(vdup_n_u64((std::max)(vgetq_lane_u64(a, 0), vgetq_lane_u64(b, 0))),
1630  vdup_n_u64((std::max)(vgetq_lane_u64(a, 1), vgetq_lane_u64(b, 1))));
1631 }
1632 
1633 template <>
1635  return vreinterpret_f32_u32(vcle_f32(a, b));
1636 }
1637 template <>
1639  return vreinterpretq_f32_u32(vcleq_f32(a, b));
1640 }
1641 template <>
1643  return vget_lane_s32(
1644  vreinterpret_s32_u8(vcle_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
1645 }
1646 template <>
1648  return vreinterpret_s8_u8(vcle_s8(a, b));
1649 }
1650 template <>
1652  return vreinterpretq_s8_u8(vcleq_s8(a, b));
1653 }
1654 template <>
1656  return vget_lane_u32(
1657  vreinterpret_u32_u8(vcle_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
1658 }
1659 template <>
1661  return vcle_u8(a, b);
1662 }
1663 template <>
1665  return vcleq_u8(a, b);
1666 }
1667 template <>
1669  return vreinterpret_s16_u16(vcle_s16(a, b));
1670 }
1671 template <>
1673  return vreinterpretq_s16_u16(vcleq_s16(a, b));
1674 }
1675 template <>
1677  return vcle_u16(a, b);
1678 }
1679 template <>
1681  return vcleq_u16(a, b);
1682 }
1683 template <>
1685  return vreinterpret_s32_u32(vcle_s32(a, b));
1686 }
1687 template <>
1689  return vreinterpretq_s32_u32(vcleq_s32(a, b));
1690 }
1691 template <>
1693  return vcle_u32(a, b);
1694 }
1695 template <>
1697  return vcleq_u32(a, b);
1698 }
1699 template <>
1701 #if EIGEN_ARCH_ARM64
1702  return vreinterpretq_s64_u64(vcleq_s64(a, b));
1703 #else
1704  return vcombine_s64(vdup_n_s64(vgetq_lane_s64(a, 0) <= vgetq_lane_s64(b, 0) ? numext::int64_t(-1) : 0),
1705  vdup_n_s64(vgetq_lane_s64(a, 1) <= vgetq_lane_s64(b, 1) ? numext::int64_t(-1) : 0));
1706 #endif
1707 }
1708 template <>
1710 #if EIGEN_ARCH_ARM64
1711  return vcleq_u64(a, b);
1712 #else
1713  return vcombine_u64(vdup_n_u64(vgetq_lane_u64(a, 0) <= vgetq_lane_u64(b, 0) ? numext::uint64_t(-1) : 0),
1714  vdup_n_u64(vgetq_lane_u64(a, 1) <= vgetq_lane_u64(b, 1) ? numext::uint64_t(-1) : 0));
1715 #endif
1716 }
1717 
1718 template <>
1720  return vreinterpret_f32_u32(vclt_f32(a, b));
1721 }
1722 template <>
1724  return vreinterpretq_f32_u32(vcltq_f32(a, b));
1725 }
1726 template <>
1728  return vget_lane_s32(
1729  vreinterpret_s32_u8(vclt_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
1730 }
1731 template <>
1733  return vreinterpret_s8_u8(vclt_s8(a, b));
1734 }
1735 template <>
1737  return vreinterpretq_s8_u8(vcltq_s8(a, b));
1738 }
1739 template <>
1741  return vget_lane_u32(
1742  vreinterpret_u32_u8(vclt_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
1743 }
1744 template <>
1746  return vclt_u8(a, b);
1747 }
1748 template <>
1750  return vcltq_u8(a, b);
1751 }
1752 template <>
1754  return vreinterpret_s16_u16(vclt_s16(a, b));
1755 }
1756 template <>
1758  return vreinterpretq_s16_u16(vcltq_s16(a, b));
1759 }
1760 template <>
1762  return vclt_u16(a, b);
1763 }
1764 template <>
1766  return vcltq_u16(a, b);
1767 }
1768 template <>
1770  return vreinterpret_s32_u32(vclt_s32(a, b));
1771 }
1772 template <>
1774  return vreinterpretq_s32_u32(vcltq_s32(a, b));
1775 }
1776 template <>
1778  return vclt_u32(a, b);
1779 }
1780 template <>
1782  return vcltq_u32(a, b);
1783 }
1784 template <>
1786 #if EIGEN_ARCH_ARM64
1787  return vreinterpretq_s64_u64(vcltq_s64(a, b));
1788 #else
1789  return vcombine_s64(vdup_n_s64(vgetq_lane_s64(a, 0) < vgetq_lane_s64(b, 0) ? numext::int64_t(-1) : 0),
1790  vdup_n_s64(vgetq_lane_s64(a, 1) < vgetq_lane_s64(b, 1) ? numext::int64_t(-1) : 0));
1791 #endif
1792 }
1793 template <>
1795 #if EIGEN_ARCH_ARM64
1796  return vcltq_u64(a, b);
1797 #else
1798  return vcombine_u64(vdup_n_u64(vgetq_lane_u64(a, 0) < vgetq_lane_u64(b, 0) ? numext::uint64_t(-1) : 0),
1799  vdup_n_u64(vgetq_lane_u64(a, 1) < vgetq_lane_u64(b, 1) ? numext::uint64_t(-1) : 0));
1800 #endif
1801 }
1802 
1803 template <>
1805  return vreinterpret_f32_u32(vceq_f32(a, b));
1806 }
1807 template <>
1809  return vreinterpretq_f32_u32(vceqq_f32(a, b));
1810 }
1811 template <>
1813  return vget_lane_s32(
1814  vreinterpret_s32_u8(vceq_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
1815 }
1816 template <>
1818  return vreinterpret_s8_u8(vceq_s8(a, b));
1819 }
1820 template <>
1822  return vreinterpretq_s8_u8(vceqq_s8(a, b));
1823 }
1824 template <>
1826  return vget_lane_u32(
1827  vreinterpret_u32_u8(vceq_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
1828 }
1829 template <>
1831  return vceq_u8(a, b);
1832 }
1833 template <>
1835  return vceqq_u8(a, b);
1836 }
1837 template <>
1839  return vreinterpret_s16_u16(vceq_s16(a, b));
1840 }
1841 template <>
1843  return vreinterpretq_s16_u16(vceqq_s16(a, b));
1844 }
1845 template <>
1847  return vceq_u16(a, b);
1848 }
1849 template <>
1851  return vceqq_u16(a, b);
1852 }
1853 template <>
1855  return vreinterpret_s32_u32(vceq_s32(a, b));
1856 }
1857 template <>
1859  return vreinterpretq_s32_u32(vceqq_s32(a, b));
1860 }
1861 template <>
1863  return vceq_u32(a, b);
1864 }
1865 template <>
1867  return vceqq_u32(a, b);
1868 }
1869 template <>
1871 #if EIGEN_ARCH_ARM64
1872  return vreinterpretq_s64_u64(vceqq_s64(a, b));
1873 #else
1874  return vcombine_s64(vdup_n_s64(vgetq_lane_s64(a, 0) == vgetq_lane_s64(b, 0) ? numext::int64_t(-1) : 0),
1875  vdup_n_s64(vgetq_lane_s64(a, 1) == vgetq_lane_s64(b, 1) ? numext::int64_t(-1) : 0));
1876 #endif
1877 }
1878 template <>
1880 #if EIGEN_ARCH_ARM64
1881  return vceqq_u64(a, b);
1882 #else
1883  return vcombine_u64(vdup_n_u64(vgetq_lane_u64(a, 0) == vgetq_lane_u64(b, 0) ? numext::uint64_t(-1) : 0),
1884  vdup_n_u64(vgetq_lane_u64(a, 1) == vgetq_lane_u64(b, 1) ? numext::uint64_t(-1) : 0));
1885 #endif
1886 }
1887 
1888 template <>
1890  return vreinterpret_f32_u32(vmvn_u32(vcge_f32(a, b)));
1891 }
1892 template <>
1894  return vreinterpretq_f32_u32(vmvnq_u32(vcgeq_f32(a, b)));
1895 }
1896 
1897 // Logical operations are not supported on float vectors, so we reinterpret-cast to uint32 vectors using NEON intrinsics
1898 template <>
1900  return vreinterpret_f32_u32(vand_u32(vreinterpret_u32_f32(a), vreinterpret_u32_f32(b)));
1901 }
1902 template <>
1904  return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b)));
1905 }
1906 template <>
1908  return a & b;
1909 }
1910 template <>
1912  return vand_s8(a, b);
1913 }
1914 template <>
1916  return vandq_s8(a, b);
1917 }
1918 template <>
1920  return a & b;
1921 }
1922 template <>
1924  return vand_u8(a, b);
1925 }
1926 template <>
1928  return vandq_u8(a, b);
1929 }
1930 template <>
1932  return vand_s16(a, b);
1933 }
1934 template <>
1936  return vandq_s16(a, b);
1937 }
1938 template <>
1940  return vand_u16(a, b);
1941 }
1942 template <>
1944  return vandq_u16(a, b);
1945 }
1946 template <>
1948  return vand_s32(a, b);
1949 }
1950 template <>
1952  return vandq_s32(a, b);
1953 }
1954 template <>
1956  return vand_u32(a, b);
1957 }
1958 template <>
1960  return vandq_u32(a, b);
1961 }
1962 template <>
1964  return vandq_s64(a, b);
1965 }
1966 template <>
1968  return vandq_u64(a, b);
1969 }
1970 
1971 template <>
1973  return vreinterpret_f32_u32(vorr_u32(vreinterpret_u32_f32(a), vreinterpret_u32_f32(b)));
1974 }
1975 template <>
1977  return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b)));
1978 }
1979 template <>
1981  return a | b;
1982 }
1983 template <>
1985  return vorr_s8(a, b);
1986 }
1987 template <>
1989  return vorrq_s8(a, b);
1990 }
1991 template <>
1993  return a | b;
1994 }
1995 template <>
1997  return vorr_u8(a, b);
1998 }
1999 template <>
2001  return vorrq_u8(a, b);
2002 }
2003 template <>
2005  return vorr_s16(a, b);
2006 }
2007 template <>
2009  return vorrq_s16(a, b);
2010 }
2011 template <>
2013  return vorr_u16(a, b);
2014 }
2015 template <>
2017  return vorrq_u16(a, b);
2018 }
2019 template <>
2021  return vorr_s32(a, b);
2022 }
2023 template <>
2025  return vorrq_s32(a, b);
2026 }
2027 template <>
2029  return vorr_u32(a, b);
2030 }
2031 template <>
2033  return vorrq_u32(a, b);
2034 }
2035 template <>
2037  return vorrq_s64(a, b);
2038 }
2039 template <>
2041  return vorrq_u64(a, b);
2042 }
2043 
2044 template <>
2046  return vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(a), vreinterpret_u32_f32(b)));
2047 }
2048 template <>
2050  return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b)));
2051 }
2052 template <>
2054  return a ^ b;
2055 }
2056 template <>
2058  return veor_s8(a, b);
2059 }
2060 template <>
2062  return veorq_s8(a, b);
2063 }
2064 template <>
2066  return a ^ b;
2067 }
2068 template <>
2070  return veor_u8(a, b);
2071 }
2072 template <>
2074  return veorq_u8(a, b);
2075 }
2076 template <>
2078  return veor_s16(a, b);
2079 }
2080 template <>
2082  return veorq_s16(a, b);
2083 }
2084 template <>
2086  return veor_u16(a, b);
2087 }
2088 template <>
2090  return veorq_u16(a, b);
2091 }
2092 template <>
2094  return veor_s32(a, b);
2095 }
2096 template <>
2098  return veorq_s32(a, b);
2099 }
2100 template <>
2102  return veor_u32(a, b);
2103 }
2104 template <>
2106  return veorq_u32(a, b);
2107 }
2108 template <>
2110  return veorq_s64(a, b);
2111 }
2112 template <>
2114  return veorq_u64(a, b);
2115 }
2116 
2117 template <>
2119  return vreinterpret_f32_u32(vbic_u32(vreinterpret_u32_f32(a), vreinterpret_u32_f32(b)));
2120 }
2121 template <>
2123  return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b)));
2124 }
2125 template <>
2127  return a & ~b;
2128 }
2129 template <>
2131  return vbic_s8(a, b);
2132 }
2133 template <>
2135  return vbicq_s8(a, b);
2136 }
2137 template <>
2139  return a & ~b;
2140 }
2141 template <>
2143  return vbic_u8(a, b);
2144 }
2145 template <>
2147  return vbicq_u8(a, b);
2148 }
2149 template <>
2151  return vbic_s16(a, b);
2152 }
2153 template <>
2155  return vbicq_s16(a, b);
2156 }
2157 template <>
2159  return vbic_u16(a, b);
2160 }
2161 template <>
2163  return vbicq_u16(a, b);
2164 }
2165 template <>
2167  return vbic_s32(a, b);
2168 }
2169 template <>
2171  return vbicq_s32(a, b);
2172 }
2173 template <>
2175  return vbic_u32(a, b);
2176 }
2177 template <>
2179  return vbicq_u32(a, b);
2180 }
2181 template <>
2183  return vbicq_s64(a, b);
2184 }
2185 template <>
2187  return vbicq_u64(a, b);
2188 }
2189 
2190 template <int N>
2192  return vget_lane_s32(vreinterpret_s32_s8(vshr_n_s8(vreinterpret_s8_s32(vdup_n_s32(a)), N)), 0);
2193 }
2194 template <int N>
2196  return vshr_n_s8(a, N);
2197 }
2198 template <int N>
2200  return vshrq_n_s8(a, N);
2201 }
2202 template <int N>
2204  return vget_lane_u32(vreinterpret_u32_u8(vshr_n_u8(vreinterpret_u8_u32(vdup_n_u32(a)), N)), 0);
2205 }
2206 template <int N>
2208  return vshr_n_u8(a, N);
2209 }
2210 template <int N>
2212  return vshrq_n_u8(a, N);
2213 }
2214 template <int N>
2216  return vshr_n_s16(a, N);
2217 }
2218 template <int N>
2220  return vshrq_n_s16(a, N);
2221 }
2222 template <int N>
2224  return vshr_n_u16(a, N);
2225 }
2226 template <int N>
2228  return vshrq_n_u16(a, N);
2229 }
2230 template <int N>
2232  return vshr_n_s32(a, N);
2233 }
2234 template <int N>
2236  return vshrq_n_s32(a, N);
2237 }
2238 template <int N>
2240  return vshr_n_u32(a, N);
2241 }
2242 template <int N>
2244  return vshrq_n_u32(a, N);
2245 }
2246 template <int N>
2248  return vshrq_n_s64(a, N);
2249 }
2250 template <int N>
2252  return vshrq_n_u64(a, N);
2253 }
2254 
2255 template <int N>
2257  return vget_lane_s32(vreinterpret_s32_u8(vshr_n_u8(vreinterpret_u8_s32(vdup_n_s32(a)), N)), 0);
2258 }
2259 template <int N>
2261  return vreinterpret_s8_u8(vshr_n_u8(vreinterpret_u8_s8(a), N));
2262 }
2263 template <int N>
2265  return vreinterpretq_s8_u8(vshrq_n_u8(vreinterpretq_u8_s8(a), N));
2266 }
// Logical right shift by N of four uint8 values packed in a 32-bit scalar: the scalar is broadcast into a
// 64-bit vector, every byte is shifted, and lane 0 is extracted again.
// The shift must be the unsigned vshr_n_u8. The signed vshr_n_s8 is an arithmetic shift that replicates bit 7
// into the vacated bits, corrupting every byte >= 0x80. This matches the 8- and 16-lane unsigned variants.
// NOTE(review): presumably plogical_shift_right for Packet4uc -- the signature line is missing here; confirm.
2267 template <int N>
2269  return vget_lane_u32(vreinterpret_u32_u8(vshr_n_u8(vreinterpret_u8_u32(vdup_n_u32(a)), N)), 0);
2270 }
2271 template <int N>
2273  return vshr_n_u8(a, N);
2274 }
2275 template <int N>
2277  return vshrq_n_u8(a, N);
2278 }
2279 template <int N>
2281  return vreinterpret_s16_u16(vshr_n_u16(vreinterpret_u16_s16(a), N));
2282 }
2283 template <int N>
2285  return vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(a), N));
2286 }
2287 template <int N>
2289  return vshr_n_u16(a, N);
2290 }
2291 template <int N>
2293  return vshrq_n_u16(a, N);
2294 }
2295 template <int N>
2297  return vreinterpret_s32_u32(vshr_n_u32(vreinterpret_u32_s32(a), N));
2298 }
2299 template <int N>
2301  return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), N));
2302 }
2303 template <int N>
2305  return vshr_n_u32(a, N);
2306 }
2307 template <int N>
2309  return vshrq_n_u32(a, N);
2310 }
2311 template <int N>
2313  return vreinterpretq_s64_u64(vshrq_n_u64(vreinterpretq_u64_s64(a), N));
2314 }
2315 template <int N>
2317  return vshrq_n_u64(a, N);
2318 }
2319 
2320 template <int N>
2322  return vget_lane_s32(vreinterpret_s32_s8(vshl_n_s8(vreinterpret_s8_s32(vdup_n_s32(a)), N)), 0);
2323 }
2324 template <int N>
2326  return vshl_n_s8(a, N);
2327 }
2328 template <int N>
2330  return vshlq_n_s8(a, N);
2331 }
2332 template <int N>
2334  return vget_lane_u32(vreinterpret_u32_u8(vshl_n_u8(vreinterpret_u8_u32(vdup_n_u32(a)), N)), 0);
2335 }
2336 template <int N>
2338  return vshl_n_u8(a, N);
2339 }
2340 template <int N>
2342  return vshlq_n_u8(a, N);
2343 }
2344 template <int N>
2346  return vshl_n_s16(a, N);
2347 }
2348 template <int N>
2350  return vshlq_n_s16(a, N);
2351 }
2352 template <int N>
2354  return vshl_n_u16(a, N);
2355 }
2356 template <int N>
2358  return vshlq_n_u16(a, N);
2359 }
2360 template <int N>
2362  return vshl_n_s32(a, N);
2363 }
2364 template <int N>
2366  return vshlq_n_s32(a, N);
2367 }
2368 template <int N>
2370  return vshl_n_u32(a, N);
2371 }
2372 template <int N>
2374  return vshlq_n_u32(a, N);
2375 }
2376 template <int N>
2378  return vshlq_n_s64(a, N);
2379 }
2380 template <int N>
2382  return vshlq_n_u64(a, N);
2383 }
2384 
2385 template <>
2387  EIGEN_DEBUG_ALIGNED_LOAD return vld1_f32(from);
2388 }
// Loads four consecutive floats starting at `from` into a Packet4f via vld1q_f32.
// EIGEN_DEBUG_ALIGNED_LOAD is a macro defined outside this view; presumably a debug hook for aligned loads.
// NOTE(review): nothing here checks alignment of `from` -- ploadu<Packet4f> issues the same intrinsic.
2389 template <>
2390 EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) {
2391  EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from);
2392 }
2393 template <>
2395  Packet4c res;
2396  memcpy(&res, from, sizeof(Packet4c));
2397  return res;
2398 }
2399 template <>
2401  EIGEN_DEBUG_ALIGNED_LOAD return vld1_s8(from);
2402 }
2403 template <>
2405  EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s8(from);
2406 }
2407 template <>
2409  Packet4uc res;
2410  memcpy(&res, from, sizeof(Packet4uc));
2411  return res;
2412 }
2413 template <>
2415  EIGEN_DEBUG_ALIGNED_LOAD return vld1_u8(from);
2416 }
2417 template <>
2419  EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u8(from);
2420 }
2421 template <>
2423  EIGEN_DEBUG_ALIGNED_LOAD return vld1_s16(from);
2424 }
2425 template <>
2427  EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s16(from);
2428 }
2429 template <>
2431  EIGEN_DEBUG_ALIGNED_LOAD return vld1_u16(from);
2432 }
2433 template <>
2435  EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u16(from);
2436 }
2437 template <>
2439  EIGEN_DEBUG_ALIGNED_LOAD return vld1_s32(from);
2440 }
2441 template <>
2443  EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from);
2444 }
2445 template <>
2447  EIGEN_DEBUG_ALIGNED_LOAD return vld1_u32(from);
2448 }
2449 template <>
2451  EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u32(from);
2452 }
2453 template <>
2455  EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s64(from);
2456 }
2457 template <>
2459  EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u64(from);
2460 }
2461 
2462 template <>
2464  EIGEN_DEBUG_UNALIGNED_LOAD return vld1_f32(from);
2465 }
// Loads four consecutive floats starting at `from` into a Packet4f via vld1q_f32.
// EIGEN_DEBUG_UNALIGNED_LOAD is a macro defined outside this view; presumably a debug hook for unaligned loads.
2466 template <>
2467 EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
2468  EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from);
2469 }
2470 template <>
2472  Packet4c res;
2473  memcpy(&res, from, sizeof(Packet4c));
2474  return res;
2475 }
2476 template <>
2478  EIGEN_DEBUG_UNALIGNED_LOAD return vld1_s8(from);
2479 }
2480 template <>
2482  EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s8(from);
2483 }
2484 template <>
2486  Packet4uc res;
2487  memcpy(&res, from, sizeof(Packet4uc));
2488  return res;
2489 }
2490 template <>
2492  EIGEN_DEBUG_UNALIGNED_LOAD return vld1_u8(from);
2493 }
2494 template <>
2496  EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u8(from);
2497 }
2498 template <>
2500  EIGEN_DEBUG_UNALIGNED_LOAD return vld1_s16(from);
2501 }
2502 template <>
2504  EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s16(from);
2505 }
2506 template <>
2508  EIGEN_DEBUG_UNALIGNED_LOAD return vld1_u16(from);
2509 }
2510 template <>
2512  EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u16(from);
2513 }
2514 template <>
2516  EIGEN_DEBUG_UNALIGNED_LOAD return vld1_s32(from);
2517 }
2518 template <>
2520  EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from);
2521 }
2522 template <>
2524  EIGEN_DEBUG_UNALIGNED_LOAD return vld1_u32(from);
2525 }
2526 template <>
2528  EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u32(from);
2529 }
2530 template <>
2532  EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s64(from);
2533 }
2534 template <>
2536  EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u64(from);
2537 }
2538 
2539 template <>
2541  return vld1_dup_f32(from);
2542 }
// Returns {from[0], from[0], from[1], from[1]}; only two floats are read from memory.
// Each vld1_dup_f32 broadcasts one float into both lanes of a 64-bit vector; vcombine_f32 places the first
// vector in the low half of the result and the second in the high half.
2543 template <>
2544 EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) {
2545  return vcombine_f32(vld1_dup_f32(from), vld1_dup_f32(from + 1));
2546 }
2547 template <>
2549  const int8x8_t a = vreinterpret_s8_s32(vdup_n_s32(pload<Packet4c>(from)));
2550  return vget_lane_s32(vreinterpret_s32_s8(vzip_s8(a, a).val[0]), 0);
2551 }
2552 template <>
2554  const int8x8_t a = vld1_s8(from);
2555  return vzip_s8(a, a).val[0];
2556 }
2557 template <>
2559  const int8x8_t a = vld1_s8(from);
2560  const int8x8x2_t b = vzip_s8(a, a);
2561  return vcombine_s8(b.val[0], b.val[1]);
2562 }
2563 template <>
2565  const uint8x8_t a = vreinterpret_u8_u32(vdup_n_u32(pload<Packet4uc>(from)));
2566  return vget_lane_u32(vreinterpret_u32_u8(vzip_u8(a, a).val[0]), 0);
2567 }
2568 template <>
2570  const uint8x8_t a = vld1_u8(from);
2571  return vzip_u8(a, a).val[0];
2572 }
2573 template <>
2575  const uint8x8_t a = vld1_u8(from);
2576  const uint8x8x2_t b = vzip_u8(a, a);
2577  return vcombine_u8(b.val[0], b.val[1]);
2578 }
2579 template <>
2581  return vreinterpret_s16_u32(
2582  vzip_u32(vreinterpret_u32_s16(vld1_dup_s16(from)), vreinterpret_u32_s16(vld1_dup_s16(from + 1))).val[0]);
2583 }
2584 template <>
2586  const int16x4_t a = vld1_s16(from);
2587  const int16x4x2_t b = vzip_s16(a, a);
2588  return vcombine_s16(b.val[0], b.val[1]);
2589 }
2590 template <>
2592  return vreinterpret_u16_u32(
2593  vzip_u32(vreinterpret_u32_u16(vld1_dup_u16(from)), vreinterpret_u32_u16(vld1_dup_u16(from + 1))).val[0]);
2594 }
2595 template <>
2597  const uint16x4_t a = vld1_u16(from);
2598  const uint16x4x2_t b = vzip_u16(a, a);
2599  return vcombine_u16(b.val[0], b.val[1]);
2600 }
2601 template <>
2603  return vld1_dup_s32(from);
2604 }
2605 template <>
2607  return vcombine_s32(vld1_dup_s32(from), vld1_dup_s32(from + 1));
2608 }
2609 template <>
2611  return vld1_dup_u32(from);
2612 }
2613 template <>
2615  return vcombine_u32(vld1_dup_u32(from), vld1_dup_u32(from + 1));
2616 }
2617 template <>
2619  return vld1q_dup_s64(from);
2620 }
2621 template <>
2623  return vld1q_dup_u64(from);
2624 }
2625 
2626 template <>
2628  return vld1q_dup_f32(from);
2629 }
2630 template <>
2632  return vget_lane_s32(vreinterpret_s32_s8(vld1_dup_s8(from)), 0);
2633 }
2634 template <>
2636  return vreinterpret_s8_u32(
2637  vzip_u32(vreinterpret_u32_s8(vld1_dup_s8(from)), vreinterpret_u32_s8(vld1_dup_s8(from + 1))).val[0]);
2638 }
2639 template <>
2641  const int8x8_t a = vreinterpret_s8_u32(
2642  vzip_u32(vreinterpret_u32_s8(vld1_dup_s8(from)), vreinterpret_u32_s8(vld1_dup_s8(from + 1))).val[0]);
2643  const int8x8_t b = vreinterpret_s8_u32(
2644  vzip_u32(vreinterpret_u32_s8(vld1_dup_s8(from + 2)), vreinterpret_u32_s8(vld1_dup_s8(from + 3))).val[0]);
2645  return vcombine_s8(a, b);
2646 }
2647 template <>
2649  return vget_lane_u32(vreinterpret_u32_u8(vld1_dup_u8(from)), 0);
2650 }
2651 template <>
2653  return vreinterpret_u8_u32(
2654  vzip_u32(vreinterpret_u32_u8(vld1_dup_u8(from)), vreinterpret_u32_u8(vld1_dup_u8(from + 1))).val[0]);
2655 }
2656 template <>
2658  const uint8x8_t a = vreinterpret_u8_u32(
2659  vzip_u32(vreinterpret_u32_u8(vld1_dup_u8(from)), vreinterpret_u32_u8(vld1_dup_u8(from + 1))).val[0]);
2660  const uint8x8_t b = vreinterpret_u8_u32(
2661  vzip_u32(vreinterpret_u32_u8(vld1_dup_u8(from + 2)), vreinterpret_u32_u8(vld1_dup_u8(from + 3))).val[0]);
2662  return vcombine_u8(a, b);
2663 }
2664 template <>
2666  return vcombine_s16(vld1_dup_s16(from), vld1_dup_s16(from + 1));
2667 }
2668 template <>
2670  return vcombine_u16(vld1_dup_u16(from), vld1_dup_u16(from + 1));
2671 }
2672 template <>
2674  return vld1q_dup_s32(from);
2675 }
2676 template <>
2678  return vld1q_dup_u32(from);
2679 }
2680 
// Writes the two float lanes of `from` to to[0..1] via vst1_f32.
// EIGEN_DEBUG_ALIGNED_STORE is a macro defined outside this view; presumably a debug hook for aligned stores.
2681 template <>
2682 EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet2f& from) {
2683  EIGEN_DEBUG_ALIGNED_STORE vst1_f32(to, from);
2684 }
// Writes the four float lanes of `from` to to[0..3] via vst1q_f32.
// EIGEN_DEBUG_ALIGNED_STORE is a macro defined outside this view; presumably a debug hook for aligned stores.
2685 template <>
2686 EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) {
2687  EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to, from);
2688 }
2689 template <>
2691  memcpy(to, &from, sizeof(from));
2692 }
2693 template <>
2695  EIGEN_DEBUG_ALIGNED_STORE vst1_s8(to, from);
2696 }
// Writes the sixteen int8 lanes of `from` to to[0..15] via vst1q_s8.
// EIGEN_DEBUG_ALIGNED_STORE is a macro defined outside this view; presumably a debug hook for aligned stores.
2697 template <>
2698 EIGEN_STRONG_INLINE void pstore<int8_t>(int8_t* to, const Packet16c& from) {
2699  EIGEN_DEBUG_ALIGNED_STORE vst1q_s8(to, from);
2700 }
2701 template <>
2703  memcpy(to, &from, sizeof(from));
2704 }
2705 template <>
2707  EIGEN_DEBUG_ALIGNED_STORE vst1_u8(to, from);
2708 }
// Writes the sixteen uint8 lanes of `from` to to[0..15] via vst1q_u8.
// EIGEN_DEBUG_ALIGNED_STORE is a macro defined outside this view; presumably a debug hook for aligned stores.
2709 template <>
2710 EIGEN_STRONG_INLINE void pstore<uint8_t>(uint8_t* to, const Packet16uc& from) {
2711  EIGEN_DEBUG_ALIGNED_STORE vst1q_u8(to, from);
2712 }
2713 template <>
2715  EIGEN_DEBUG_ALIGNED_STORE vst1_s16(to, from);
2716 }
// Writes the eight int16 lanes of `from` to to[0..7] via vst1q_s16.
// EIGEN_DEBUG_ALIGNED_STORE is a macro defined outside this view; presumably a debug hook for aligned stores.
2717 template <>
2718 EIGEN_STRONG_INLINE void pstore<int16_t>(int16_t* to, const Packet8s& from) {
2719  EIGEN_DEBUG_ALIGNED_STORE vst1q_s16(to, from);
2720 }
2721 template <>
2723  EIGEN_DEBUG_ALIGNED_STORE vst1_u16(to, from);
2724 }
// Writes the eight uint16 lanes of `from` to to[0..7] via vst1q_u16.
// EIGEN_DEBUG_ALIGNED_STORE is a macro defined outside this view; presumably a debug hook for aligned stores.
2725 template <>
2726 EIGEN_STRONG_INLINE void pstore<uint16_t>(uint16_t* to, const Packet8us& from) {
2727  EIGEN_DEBUG_ALIGNED_STORE vst1q_u16(to, from);
2728 }
2729 template <>
2731  EIGEN_DEBUG_ALIGNED_STORE vst1_s32(to, from);
2732 }
// Writes the four int32 lanes of `from` to to[0..3] via vst1q_s32.
// EIGEN_DEBUG_ALIGNED_STORE is a macro defined outside this view; presumably a debug hook for aligned stores.
2733 template <>
2734 EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t* to, const Packet4i& from) {
2735  EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to, from);
2736 }
2737 template <>
2739  EIGEN_DEBUG_ALIGNED_STORE vst1_u32(to, from);
2740 }
// Writes the four uint32 lanes of `from` to to[0..3] via vst1q_u32.
// EIGEN_DEBUG_ALIGNED_STORE is a macro defined outside this view; presumably a debug hook for aligned stores.
2741 template <>
2742 EIGEN_STRONG_INLINE void pstore<uint32_t>(uint32_t* to, const Packet4ui& from) {
2743  EIGEN_DEBUG_ALIGNED_STORE vst1q_u32(to, from);
2744 }
// Writes the two int64 lanes of `from` to to[0..1] via vst1q_s64.
// EIGEN_DEBUG_ALIGNED_STORE is a macro defined outside this view; presumably a debug hook for aligned stores.
2745 template <>
2746 EIGEN_STRONG_INLINE void pstore<int64_t>(int64_t* to, const Packet2l& from) {
2747  EIGEN_DEBUG_ALIGNED_STORE vst1q_s64(to, from);
2748 }
// Writes the two uint64 lanes of `from` to to[0..1] via vst1q_u64.
// EIGEN_DEBUG_ALIGNED_STORE is a macro defined outside this view; presumably a debug hook for aligned stores.
2749 template <>
2750 EIGEN_STRONG_INLINE void pstore<uint64_t>(uint64_t* to, const Packet2ul& from) {
2751  EIGEN_DEBUG_ALIGNED_STORE vst1q_u64(to, from);
2752 }
2753 
// Writes the two float lanes of `from` to to[0..1] via vst1_f32.
// EIGEN_DEBUG_UNALIGNED_STORE is a macro defined outside this view; presumably a debug hook for unaligned stores.
2754 template <>
2755 EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet2f& from) {
2756  EIGEN_DEBUG_UNALIGNED_STORE vst1_f32(to, from);
2757 }
// Writes the four float lanes of `from` to to[0..3] via vst1q_f32.
// EIGEN_DEBUG_UNALIGNED_STORE is a macro defined outside this view; presumably a debug hook for unaligned stores.
2758 template <>
2759 EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) {
2760  EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from);
2761 }
2762 template <>
2764  memcpy(to, &from, sizeof(from));
2765 }
2766 template <>
2768  EIGEN_DEBUG_UNALIGNED_STORE vst1_s8(to, from);
2769 }
// Writes the sixteen int8 lanes of `from` to to[0..15] via vst1q_s8.
// EIGEN_DEBUG_UNALIGNED_STORE is a macro defined outside this view; presumably a debug hook for unaligned stores.
2770 template <>
2771 EIGEN_STRONG_INLINE void pstoreu<int8_t>(int8_t* to, const Packet16c& from) {
2772  EIGEN_DEBUG_UNALIGNED_STORE vst1q_s8(to, from);
2773 }
2774 template <>
2776  memcpy(to, &from, sizeof(from));
2777 }
2778 template <>
2780  EIGEN_DEBUG_UNALIGNED_STORE vst1_u8(to, from);
2781 }
// Writes the sixteen uint8 lanes of `from` to to[0..15] via vst1q_u8.
// EIGEN_DEBUG_UNALIGNED_STORE is a macro defined outside this view; presumably a debug hook for unaligned stores.
2782 template <>
2783 EIGEN_STRONG_INLINE void pstoreu<uint8_t>(uint8_t* to, const Packet16uc& from) {
2784  EIGEN_DEBUG_UNALIGNED_STORE vst1q_u8(to, from);
2785 }
2786 template <>
2788  EIGEN_DEBUG_UNALIGNED_STORE vst1_s16(to, from);
2789 }
// Writes the eight int16 lanes of `from` to to[0..7] via vst1q_s16.
// EIGEN_DEBUG_UNALIGNED_STORE is a macro defined outside this view; presumably a debug hook for unaligned stores.
2790 template <>
2791 EIGEN_STRONG_INLINE void pstoreu<int16_t>(int16_t* to, const Packet8s& from) {
2792  EIGEN_DEBUG_UNALIGNED_STORE vst1q_s16(to, from);
2793 }
2794 template <>
2796  EIGEN_DEBUG_UNALIGNED_STORE vst1_u16(to, from);
2797 }
2798 template <>
2800  EIGEN_DEBUG_UNALIGNED_STORE vst1q_u16(to, from);
2801 }
2802 template <>
2804  EIGEN_DEBUG_UNALIGNED_STORE vst1_s32(to, from);
2805 }
// Writes the four int32 lanes of `from` to to[0..3] via vst1q_s32.
// EIGEN_DEBUG_UNALIGNED_STORE is a macro defined outside this view; presumably a debug hook for unaligned stores.
2806 template <>
2807 EIGEN_STRONG_INLINE void pstoreu<int32_t>(int32_t* to, const Packet4i& from) {
2808  EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from);
2809 }
2810 template <>
2812  EIGEN_DEBUG_UNALIGNED_STORE vst1_u32(to, from);
2813 }
2814 template <>
2816  EIGEN_DEBUG_UNALIGNED_STORE vst1q_u32(to, from);
2817 }
// Writes the two int64 lanes of `from` to to[0..1] via vst1q_s64.
// EIGEN_DEBUG_UNALIGNED_STORE is a macro defined outside this view; presumably a debug hook for unaligned stores.
2818 template <>
2819 EIGEN_STRONG_INLINE void pstoreu<int64_t>(int64_t* to, const Packet2l& from) {
2820  EIGEN_DEBUG_UNALIGNED_STORE vst1q_s64(to, from);
2821 }
2822 template <>
2824  EIGEN_DEBUG_UNALIGNED_STORE vst1q_u64(to, from);
2825 }
2826 
2827 template <>
2829  Packet2f res = vld1_dup_f32(from);
2830  res = vld1_lane_f32(from + 1 * stride, res, 1);
2831  return res;
2832 }
2833 template <>
2835  Packet4f res = vld1q_dup_f32(from);
2836  res = vld1q_lane_f32(from + 1 * stride, res, 1);
2837  res = vld1q_lane_f32(from + 2 * stride, res, 2);
2838  res = vld1q_lane_f32(from + 3 * stride, res, 3);
2839  return res;
2840 }
2841 template <>
2843  Packet4c res;
2844  for (int i = 0; i != 4; i++) reinterpret_cast<int8_t*>(&res)[i] = *(from + i * stride);
2845  return res;
2846 }
2847 template <>
2849  Packet8c res = vld1_dup_s8(from);
2850  res = vld1_lane_s8(from + 1 * stride, res, 1);
2851  res = vld1_lane_s8(from + 2 * stride, res, 2);
2852  res = vld1_lane_s8(from + 3 * stride, res, 3);
2853  res = vld1_lane_s8(from + 4 * stride, res, 4);
2854  res = vld1_lane_s8(from + 5 * stride, res, 5);
2855  res = vld1_lane_s8(from + 6 * stride, res, 6);
2856  res = vld1_lane_s8(from + 7 * stride, res, 7);
2857  return res;
2858 }
2859 template <>
2861  Packet16c res = vld1q_dup_s8(from);
2862  res = vld1q_lane_s8(from + 1 * stride, res, 1);
2863  res = vld1q_lane_s8(from + 2 * stride, res, 2);
2864  res = vld1q_lane_s8(from + 3 * stride, res, 3);
2865  res = vld1q_lane_s8(from + 4 * stride, res, 4);
2866  res = vld1q_lane_s8(from + 5 * stride, res, 5);
2867  res = vld1q_lane_s8(from + 6 * stride, res, 6);
2868  res = vld1q_lane_s8(from + 7 * stride, res, 7);
2869  res = vld1q_lane_s8(from + 8 * stride, res, 8);
2870  res = vld1q_lane_s8(from + 9 * stride, res, 9);
2871  res = vld1q_lane_s8(from + 10 * stride, res, 10);
2872  res = vld1q_lane_s8(from + 11 * stride, res, 11);
2873  res = vld1q_lane_s8(from + 12 * stride, res, 12);
2874  res = vld1q_lane_s8(from + 13 * stride, res, 13);
2875  res = vld1q_lane_s8(from + 14 * stride, res, 14);
2876  res = vld1q_lane_s8(from + 15 * stride, res, 15);
2877  return res;
2878 }
2879 template <>
2881  Packet4uc res;
2882  for (int i = 0; i != 4; i++) reinterpret_cast<uint8_t*>(&res)[i] = *(from + i * stride);
2883  return res;
2884 }
2885 template <>
2887  Packet8uc res = vld1_dup_u8(from);
2888  res = vld1_lane_u8(from + 1 * stride, res, 1);
2889  res = vld1_lane_u8(from + 2 * stride, res, 2);
2890  res = vld1_lane_u8(from + 3 * stride, res, 3);
2891  res = vld1_lane_u8(from + 4 * stride, res, 4);
2892  res = vld1_lane_u8(from + 5 * stride, res, 5);
2893  res = vld1_lane_u8(from + 6 * stride, res, 6);
2894  res = vld1_lane_u8(from + 7 * stride, res, 7);
2895  return res;
2896 }
2897 template <>
2899  Packet16uc res = vld1q_dup_u8(from);
2900  res = vld1q_lane_u8(from + 1 * stride, res, 1);
2901  res = vld1q_lane_u8(from + 2 * stride, res, 2);
2902  res = vld1q_lane_u8(from + 3 * stride, res, 3);
2903  res = vld1q_lane_u8(from + 4 * stride, res, 4);
2904  res = vld1q_lane_u8(from + 5 * stride, res, 5);
2905  res = vld1q_lane_u8(from + 6 * stride, res, 6);
2906  res = vld1q_lane_u8(from + 7 * stride, res, 7);
2907  res = vld1q_lane_u8(from + 8 * stride, res, 8);
2908  res = vld1q_lane_u8(from + 9 * stride, res, 9);
2909  res = vld1q_lane_u8(from + 10 * stride, res, 10);
2910  res = vld1q_lane_u8(from + 11 * stride, res, 11);
2911  res = vld1q_lane_u8(from + 12 * stride, res, 12);
2912  res = vld1q_lane_u8(from + 13 * stride, res, 13);
2913  res = vld1q_lane_u8(from + 14 * stride, res, 14);
2914  res = vld1q_lane_u8(from + 15 * stride, res, 15);
2915  return res;
2916 }
2917 template <>
2919  Packet4s res = vld1_dup_s16(from);
2920  res = vld1_lane_s16(from + 1 * stride, res, 1);
2921  res = vld1_lane_s16(from + 2 * stride, res, 2);
2922  res = vld1_lane_s16(from + 3 * stride, res, 3);
2923  return res;
2924 }
2925 template <>
2927  Packet8s res = vld1q_dup_s16(from);
2928  res = vld1q_lane_s16(from + 1 * stride, res, 1);
2929  res = vld1q_lane_s16(from + 2 * stride, res, 2);
2930  res = vld1q_lane_s16(from + 3 * stride, res, 3);
2931  res = vld1q_lane_s16(from + 4 * stride, res, 4);
2932  res = vld1q_lane_s16(from + 5 * stride, res, 5);
2933  res = vld1q_lane_s16(from + 6 * stride, res, 6);
2934  res = vld1q_lane_s16(from + 7 * stride, res, 7);
2935  return res;
2936 }
2937 template <>
2939  Packet4us res = vld1_dup_u16(from);
2940  res = vld1_lane_u16(from + 1 * stride, res, 1);
2941  res = vld1_lane_u16(from + 2 * stride, res, 2);
2942  res = vld1_lane_u16(from + 3 * stride, res, 3);
2943  return res;
2944 }
2945 template <>
2947  Packet8us res = vld1q_dup_u16(from);
2948  res = vld1q_lane_u16(from + 1 * stride, res, 1);
2949  res = vld1q_lane_u16(from + 2 * stride, res, 2);
2950  res = vld1q_lane_u16(from + 3 * stride, res, 3);
2951  res = vld1q_lane_u16(from + 4 * stride, res, 4);
2952  res = vld1q_lane_u16(from + 5 * stride, res, 5);
2953  res = vld1q_lane_u16(from + 6 * stride, res, 6);
2954  res = vld1q_lane_u16(from + 7 * stride, res, 7);
2955  return res;
2956 }
2957 template <>
2959  Packet2i res = vld1_dup_s32(from);
2960  res = vld1_lane_s32(from + 1 * stride, res, 1);
2961  return res;
2962 }
2963 template <>
2965  Packet4i res = vld1q_dup_s32(from);
2966  res = vld1q_lane_s32(from + 1 * stride, res, 1);
2967  res = vld1q_lane_s32(from + 2 * stride, res, 2);
2968  res = vld1q_lane_s32(from + 3 * stride, res, 3);
2969  return res;
2970 }
2971 template <>
2973  Packet2ui res = vld1_dup_u32(from);
2974  res = vld1_lane_u32(from + 1 * stride, res, 1);
2975  return res;
2976 }
2977 template <>
2979  Packet4ui res = vld1q_dup_u32(from);
2980  res = vld1q_lane_u32(from + 1 * stride, res, 1);
2981  res = vld1q_lane_u32(from + 2 * stride, res, 2);
2982  res = vld1q_lane_u32(from + 3 * stride, res, 3);
2983  return res;
2984 }
2985 template <>
2987  Packet2l res = vld1q_dup_s64(from);
2988  res = vld1q_lane_s64(from + 1 * stride, res, 1);
2989  return res;
2990 }
2991 template <>
2993  Packet2ul res = vld1q_dup_u64(from);
2994  res = vld1q_lane_u64(from + 1 * stride, res, 1);
2995  return res;
2996 }
2997 
2998 template <>
3000  vst1_lane_f32(to + stride * 0, from, 0);
3001  vst1_lane_f32(to + stride * 1, from, 1);
3002 }
3003 template <>
3004 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride) {
3005  vst1q_lane_f32(to + stride * 0, from, 0);
3006  vst1q_lane_f32(to + stride * 1, from, 1);
3007  vst1q_lane_f32(to + stride * 2, from, 2);
3008  vst1q_lane_f32(to + stride * 3, from, 3);
3009 }
3010 template <>
3012  for (int i = 0; i != 4; i++) *(to + i * stride) = reinterpret_cast<const int8_t*>(&from)[i];
3013 }
3014 template <>
3016  vst1_lane_s8(to + stride * 0, from, 0);
3017  vst1_lane_s8(to + stride * 1, from, 1);
3018  vst1_lane_s8(to + stride * 2, from, 2);
3019  vst1_lane_s8(to + stride * 3, from, 3);
3020  vst1_lane_s8(to + stride * 4, from, 4);
3021  vst1_lane_s8(to + stride * 5, from, 5);
3022  vst1_lane_s8(to + stride * 6, from, 6);
3023  vst1_lane_s8(to + stride * 7, from, 7);
3024 }
3025 template <>
3027  Index stride) {
3028  vst1q_lane_s8(to + stride * 0, from, 0);
3029  vst1q_lane_s8(to + stride * 1, from, 1);
3030  vst1q_lane_s8(to + stride * 2, from, 2);
3031  vst1q_lane_s8(to + stride * 3, from, 3);
3032  vst1q_lane_s8(to + stride * 4, from, 4);
3033  vst1q_lane_s8(to + stride * 5, from, 5);
3034  vst1q_lane_s8(to + stride * 6, from, 6);
3035  vst1q_lane_s8(to + stride * 7, from, 7);
3036  vst1q_lane_s8(to + stride * 8, from, 8);
3037  vst1q_lane_s8(to + stride * 9, from, 9);
3038  vst1q_lane_s8(to + stride * 10, from, 10);
3039  vst1q_lane_s8(to + stride * 11, from, 11);
3040  vst1q_lane_s8(to + stride * 12, from, 12);
3041  vst1q_lane_s8(to + stride * 13, from, 13);
3042  vst1q_lane_s8(to + stride * 14, from, 14);
3043  vst1q_lane_s8(to + stride * 15, from, 15);
3044 }
3045 template <>
3047  Index stride) {
3048  for (int i = 0; i != 4; i++) *(to + i * stride) = reinterpret_cast<const uint8_t*>(&from)[i];
3049 }
3050 template <>
3052  Index stride) {
3053  vst1_lane_u8(to + stride * 0, from, 0);
3054  vst1_lane_u8(to + stride * 1, from, 1);
3055  vst1_lane_u8(to + stride * 2, from, 2);
3056  vst1_lane_u8(to + stride * 3, from, 3);
3057  vst1_lane_u8(to + stride * 4, from, 4);
3058  vst1_lane_u8(to + stride * 5, from, 5);
3059  vst1_lane_u8(to + stride * 6, from, 6);
3060  vst1_lane_u8(to + stride * 7, from, 7);
3061 }
3062 template <>
3064  Index stride) {
3065  vst1q_lane_u8(to + stride * 0, from, 0);
3066  vst1q_lane_u8(to + stride * 1, from, 1);
3067  vst1q_lane_u8(to + stride * 2, from, 2);
3068  vst1q_lane_u8(to + stride * 3, from, 3);
3069  vst1q_lane_u8(to + stride * 4, from, 4);
3070  vst1q_lane_u8(to + stride * 5, from, 5);
3071  vst1q_lane_u8(to + stride * 6, from, 6);
3072  vst1q_lane_u8(to + stride * 7, from, 7);
3073  vst1q_lane_u8(to + stride * 8, from, 8);
3074  vst1q_lane_u8(to + stride * 9, from, 9);
3075  vst1q_lane_u8(to + stride * 10, from, 10);
3076  vst1q_lane_u8(to + stride * 11, from, 11);
3077  vst1q_lane_u8(to + stride * 12, from, 12);
3078  vst1q_lane_u8(to + stride * 13, from, 13);
3079  vst1q_lane_u8(to + stride * 14, from, 14);
3080  vst1q_lane_u8(to + stride * 15, from, 15);
3081 }
3082 template <>
3084  Index stride) {
3085  vst1_lane_s16(to + stride * 0, from, 0);
3086  vst1_lane_s16(to + stride * 1, from, 1);
3087  vst1_lane_s16(to + stride * 2, from, 2);
3088  vst1_lane_s16(to + stride * 3, from, 3);
3089 }
3090 template <>
3092  Index stride) {
3093  vst1q_lane_s16(to + stride * 0, from, 0);
3094  vst1q_lane_s16(to + stride * 1, from, 1);
3095  vst1q_lane_s16(to + stride * 2, from, 2);
3096  vst1q_lane_s16(to + stride * 3, from, 3);
3097  vst1q_lane_s16(to + stride * 4, from, 4);
3098  vst1q_lane_s16(to + stride * 5, from, 5);
3099  vst1q_lane_s16(to + stride * 6, from, 6);
3100  vst1q_lane_s16(to + stride * 7, from, 7);
3101 }
3102 template <>
3104  Index stride) {
3105  vst1_lane_u16(to + stride * 0, from, 0);
3106  vst1_lane_u16(to + stride * 1, from, 1);
3107  vst1_lane_u16(to + stride * 2, from, 2);
3108  vst1_lane_u16(to + stride * 3, from, 3);
3109 }
3110 template <>
3112  Index stride) {
3113  vst1q_lane_u16(to + stride * 0, from, 0);
3114  vst1q_lane_u16(to + stride * 1, from, 1);
3115  vst1q_lane_u16(to + stride * 2, from, 2);
3116  vst1q_lane_u16(to + stride * 3, from, 3);
3117  vst1q_lane_u16(to + stride * 4, from, 4);
3118  vst1q_lane_u16(to + stride * 5, from, 5);
3119  vst1q_lane_u16(to + stride * 6, from, 6);
3120  vst1q_lane_u16(to + stride * 7, from, 7);
3121 }
3122 template <>
3124  Index stride) {
3125  vst1_lane_s32(to + stride * 0, from, 0);
3126  vst1_lane_s32(to + stride * 1, from, 1);
3127 }
3128 template <>
3130  Index stride) {
3131  vst1q_lane_s32(to + stride * 0, from, 0);
3132  vst1q_lane_s32(to + stride * 1, from, 1);
3133  vst1q_lane_s32(to + stride * 2, from, 2);
3134  vst1q_lane_s32(to + stride * 3, from, 3);
3135 }
3136 template <>
3138  Index stride) {
3139  vst1_lane_u32(to + stride * 0, from, 0);
3140  vst1_lane_u32(to + stride * 1, from, 1);
3141 }
3142 template <>
3144  Index stride) {
3145  vst1q_lane_u32(to + stride * 0, from, 0);
3146  vst1q_lane_u32(to + stride * 1, from, 1);
3147  vst1q_lane_u32(to + stride * 2, from, 2);
3148  vst1q_lane_u32(to + stride * 3, from, 3);
3149 }
3150 template <>
3152  Index stride) {
3153  vst1q_lane_s64(to + stride * 0, from, 0);
3154  vst1q_lane_s64(to + stride * 1, from, 1);
3155 }
3156 template <>
3158  Index stride) {
3159  vst1q_lane_u64(to + stride * 0, from, 0);
3160  vst1q_lane_u64(to + stride * 1, from, 1);
3161 }
3162 
3163 template <>
3164 EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) {
3165  EIGEN_ARM_PREFETCH(addr);
3166 }
3167 template <>
3168 EIGEN_STRONG_INLINE void prefetch<int8_t>(const int8_t* addr) {
3169  EIGEN_ARM_PREFETCH(addr);
3170 }
3171 template <>
3172 EIGEN_STRONG_INLINE void prefetch<uint8_t>(const uint8_t* addr) {
3173  EIGEN_ARM_PREFETCH(addr);
3174 }
3175 template <>
3176 EIGEN_STRONG_INLINE void prefetch<int16_t>(const int16_t* addr) {
3177  EIGEN_ARM_PREFETCH(addr);
3178 }
3179 template <>
3181  EIGEN_ARM_PREFETCH(addr);
3182 }
3183 template <>
3184 EIGEN_STRONG_INLINE void prefetch<int32_t>(const int32_t* addr) {
3185  EIGEN_ARM_PREFETCH(addr);
3186 }
3187 template <>
3189  EIGEN_ARM_PREFETCH(addr);
3190 }
3191 template <>
3192 EIGEN_STRONG_INLINE void prefetch<int64_t>(const int64_t* addr) {
3193  EIGEN_ARM_PREFETCH(addr);
3194 }
3195 template <>
3197  EIGEN_ARM_PREFETCH(addr);
3198 }
3199 
3200 template <>
3202  return vget_lane_f32(a, 0);
3203 }
3204 template <>
3206  return vgetq_lane_f32(a, 0);
3207 }
3208 template <>
3210  return static_cast<int8_t>(a & 0xff);
3211 }
3212 template <>
3214  return vget_lane_s8(a, 0);
3215 }
3216 template <>
3218  return vgetq_lane_s8(a, 0);
3219 }
3220 template <>
3222  return static_cast<uint8_t>(a & 0xff);
3223 }
3224 template <>
3226  return vget_lane_u8(a, 0);
3227 }
3228 template <>
3230  return vgetq_lane_u8(a, 0);
3231 }
3232 template <>
3234  return vget_lane_s16(a, 0);
3235 }
3236 template <>
3238  return vgetq_lane_s16(a, 0);
3239 }
3240 template <>
3242  return vget_lane_u16(a, 0);
3243 }
3244 template <>
3246  return vgetq_lane_u16(a, 0);
3247 }
3248 template <>
3250  return vget_lane_s32(a, 0);
3251 }
3252 template <>
3254  return vgetq_lane_s32(a, 0);
3255 }
3256 template <>
3258  return vget_lane_u32(a, 0);
3259 }
3260 template <>
3262  return vgetq_lane_u32(a, 0);
3263 }
3264 template <>
3266  return vgetq_lane_s64(a, 0);
3267 }
3268 template <>
3270  return vgetq_lane_u64(a, 0);
3271 }
3272 
3273 template <>
3275  return vrev64_f32(a);
3276 }
3277 template <>
3279  const float32x4_t a_r64 = vrev64q_f32(a);
3280  return vcombine_f32(vget_high_f32(a_r64), vget_low_f32(a_r64));
3281 }
3282 template <>
3284  return vget_lane_s32(vreinterpret_s32_s8(vrev64_s8(vreinterpret_s8_s32(vdup_n_s32(a)))), 0);
3285 }
3286 template <>
3288  return vrev64_s8(a);
3289 }
3290 template <>
3292  const int8x16_t a_r64 = vrev64q_s8(a);
3293  return vcombine_s8(vget_high_s8(a_r64), vget_low_s8(a_r64));
3294 }
3295 template <>
3297  return vget_lane_u32(vreinterpret_u32_u8(vrev64_u8(vreinterpret_u8_u32(vdup_n_u32(a)))), 0);
3298 }
3299 template <>
3301  return vrev64_u8(a);
3302 }
3303 template <>
3305  const uint8x16_t a_r64 = vrev64q_u8(a);
3306  return vcombine_u8(vget_high_u8(a_r64), vget_low_u8(a_r64));
3307 }
3308 template <>
3310  return vrev64_s16(a);
3311 }
3312 template <>
3314  const int16x8_t a_r64 = vrev64q_s16(a);
3315  return vcombine_s16(vget_high_s16(a_r64), vget_low_s16(a_r64));
3316 }
3317 template <>
3319  return vrev64_u16(a);
3320 }
3321 template <>
3323  const uint16x8_t a_r64 = vrev64q_u16(a);
3324  return vcombine_u16(vget_high_u16(a_r64), vget_low_u16(a_r64));
3325 }
3326 template <>
3328  return vrev64_s32(a);
3329 }
3330 template <>
3332  const int32x4_t a_r64 = vrev64q_s32(a);
3333  return vcombine_s32(vget_high_s32(a_r64), vget_low_s32(a_r64));
3334 }
3335 template <>
3337  return vrev64_u32(a);
3338 }
3339 template <>
3341  const uint32x4_t a_r64 = vrev64q_u32(a);
3342  return vcombine_u32(vget_high_u32(a_r64), vget_low_u32(a_r64));
3343 }
3344 template <>
3346  return vcombine_s64(vget_high_s64(a), vget_low_s64(a));
3347 }
3348 template <>
3350  return vcombine_u64(vget_high_u64(a), vget_low_u64(a));
3351 }
3352 
3353 template <>
3355  return vabs_f32(a);
3356 }
3357 template <>
3359  return vabsq_f32(a);
3360 }
3361 template <>
3363  return vget_lane_s32(vreinterpret_s32_s8(vabs_s8(vreinterpret_s8_s32(vdup_n_s32(a)))), 0);
3364 }
3365 template <>
3367  return vabs_s8(a);
3368 }
3369 template <>
3371  return vabsq_s8(a);
3372 }
3373 template <>
3375  return a;
3376 }
3377 template <>
3379  return a;
3380 }
3381 template <>
3383  return a;
3384 }
3385 template <>
3387  return vabs_s16(a);
3388 }
3389 template <>
3391  return vabsq_s16(a);
3392 }
3393 template <>
3395  return a;
3396 }
3397 template <>
3399  return a;
3400 }
3401 template <>
3403  return vabs_s32(a);
3404 }
3405 template <>
3407  return vabsq_s32(a);
3408 }
3409 template <>
3411  return a;
3412 }
3413 template <>
3415  return a;
3416 }
3417 template <>
3419 #if EIGEN_ARCH_ARM64
3420  return vabsq_s64(a);
3421 #else
3422  return vcombine_s64(vdup_n_s64((std::abs)(vgetq_lane_s64(a, 0))), vdup_n_s64((std::abs)(vgetq_lane_s64(a, 1))));
3423 #endif
3424 }
3425 template <>
3427  return a;
3428 }
3429 
3430 template <>
3432  return vreinterpret_f32_s32(vshr_n_s32(vreinterpret_s32_f32(a), 31));
3433 }
3434 template <>
3436  return vreinterpretq_f32_s32(vshrq_n_s32(vreinterpretq_s32_f32(a), 31));
3437 }
3438 
3439 template <>
3441  return pfrexp_generic(a, exponent);
3442 }
3443 template <>
3445  return pfrexp_generic(a, exponent);
3446 }
3447 
3448 template <>
3450  return pldexp_generic(a, exponent);
3451 }
3452 template <>
3454  return pldexp_generic(a, exponent);
3455 }
3456 
3457 #if EIGEN_ARCH_ARM64
3458 template <>
3460  return vaddv_f32(a);
3461 }
3462 template <>
3464  return vaddvq_f32(a);
3465 }
3466 #else
3467 template <>
3469  return vget_lane_f32(vpadd_f32(a, a), 0);
3470 }
3471 template <>
3473  const float32x2_t sum = vadd_f32(vget_low_f32(a), vget_high_f32(a));
3474  return vget_lane_f32(vpadd_f32(sum, sum), 0);
3475 }
3476 #endif
3477 template <>
3479  const int8x8_t a_dup = vreinterpret_s8_s32(vdup_n_s32(a));
3480  int8x8_t sum = vpadd_s8(a_dup, a_dup);
3481  sum = vpadd_s8(sum, sum);
3482  return vget_lane_s8(sum, 0);
3483 }
3484 #if EIGEN_ARCH_ARM64
3485 template <>
3487  return vaddv_s8(a);
3488 }
3489 template <>
3491  return vaddvq_s8(a);
3492 }
3493 #else
3494 template <>
3496  int8x8_t sum = vpadd_s8(a, a);
3497  sum = vpadd_s8(sum, sum);
3498  sum = vpadd_s8(sum, sum);
3499  return vget_lane_s8(sum, 0);
3500 }
3501 template <>
3503  int8x8_t sum = vadd_s8(vget_low_s8(a), vget_high_s8(a));
3504  sum = vpadd_s8(sum, sum);
3505  sum = vpadd_s8(sum, sum);
3506  sum = vpadd_s8(sum, sum);
3507  return vget_lane_s8(sum, 0);
3508 }
3509 #endif
3510 template <>
3512  const uint8x8_t a_dup = vreinterpret_u8_u32(vdup_n_u32(a));
3513  uint8x8_t sum = vpadd_u8(a_dup, a_dup);
3514  sum = vpadd_u8(sum, sum);
3515  return vget_lane_u8(sum, 0);
3516 }
3517 #if EIGEN_ARCH_ARM64
3518 template <>
3520  return vaddv_u8(a);
3521 }
3522 template <>
3524  return vaddvq_u8(a);
3525 }
3526 template <>
3528  return vaddv_s16(a);
3529 }
3530 template <>
3532  return vaddvq_s16(a);
3533 }
3534 template <>
3536  return vaddv_u16(a);
3537 }
3538 template <>
3540  return vaddvq_u16(a);
3541 }
3542 template <>
3544  return vaddv_s32(a);
3545 }
3546 template <>
3548  return vaddvq_s32(a);
3549 }
3550 template <>
3552  return vaddv_u32(a);
3553 }
3554 template <>
3556  return vaddvq_u32(a);
3557 }
3558 template <>
3560  return vaddvq_s64(a);
3561 }
3562 template <>
3564  return vaddvq_u64(a);
3565 }
3566 #else
3567 template <>
3569  uint8x8_t sum = vpadd_u8(a, a);
3570  sum = vpadd_u8(sum, sum);
3571  sum = vpadd_u8(sum, sum);
3572  return vget_lane_u8(sum, 0);
3573 }
3574 template <>
3576  uint8x8_t sum = vadd_u8(vget_low_u8(a), vget_high_u8(a));
3577  sum = vpadd_u8(sum, sum);
3578  sum = vpadd_u8(sum, sum);
3579  sum = vpadd_u8(sum, sum);
3580  return vget_lane_u8(sum, 0);
3581 }
3582 template <>
3584  const int16x4_t sum = vpadd_s16(a, a);
3585  return vget_lane_s16(vpadd_s16(sum, sum), 0);
3586 }
3587 template <>
3589  int16x4_t sum = vadd_s16(vget_low_s16(a), vget_high_s16(a));
3590  sum = vpadd_s16(sum, sum);
3591  sum = vpadd_s16(sum, sum);
3592  return vget_lane_s16(sum, 0);
3593 }
3594 template <>
3596  const uint16x4_t sum = vpadd_u16(a, a);
3597  return vget_lane_u16(vpadd_u16(sum, sum), 0);
3598 }
3599 template <>
3601  uint16x4_t sum = vadd_u16(vget_low_u16(a), vget_high_u16(a));
3602  sum = vpadd_u16(sum, sum);
3603  sum = vpadd_u16(sum, sum);
3604  return vget_lane_u16(sum, 0);
3605 }
3606 template <>
3608  return vget_lane_s32(vpadd_s32(a, a), 0);
3609 }
3610 template <>
3612  const int32x2_t sum = vadd_s32(vget_low_s32(a), vget_high_s32(a));
3613  return vget_lane_s32(vpadd_s32(sum, sum), 0);
3614 }
3615 template <>
3617  return vget_lane_u32(vpadd_u32(a, a), 0);
3618 }
3619 template <>
3621  const uint32x2_t sum = vadd_u32(vget_low_u32(a), vget_high_u32(a));
3622  return vget_lane_u32(vpadd_u32(sum, sum), 0);
3623 }
3624 template <>
3626  return vgetq_lane_s64(a, 0) + vgetq_lane_s64(a, 1);
3627 }
3628 template <>
3630  return vgetq_lane_u64(a, 0) + vgetq_lane_u64(a, 1);
3631 }
3632 #endif
3633 
3634 template <>
3636  return vget_lane_s32(vreinterpret_s32_s8(vadd_s8(a, vreinterpret_s8_s32(vrev64_s32(vreinterpret_s32_s8(a))))), 0);
3637 }
3638 template <>
3640  return vadd_s8(vget_high_s8(a), vget_low_s8(a));
3641 }
3642 template <>
3644  return vget_lane_u32(vreinterpret_u32_u8(vadd_u8(a, vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(a))))), 0);
3645 }
3646 template <>
3648  return vadd_u8(vget_high_u8(a), vget_low_u8(a));
3649 }
3650 template <>
3652  return vadd_s16(vget_high_s16(a), vget_low_s16(a));
3653 }
3654 template <>
3656  return vadd_u16(vget_high_u16(a), vget_low_u16(a));
3657 }
3658 
3659 // Other reduction functions:
3660 // mul
3661 template <>
3663  return vget_lane_f32(a, 0) * vget_lane_f32(a, 1);
3664 }
3665 template <>
3667  return predux_mul<Packet2f>(vmul_f32(vget_low_f32(a), vget_high_f32(a)));
3668 }
3669 template <>
3671  int8x8_t prod = vreinterpret_s8_s32(vdup_n_s32(a));
3672  prod = vmul_s8(prod, vrev16_s8(prod));
3673  return vget_lane_s8(prod, 0) * vget_lane_s8(prod, 2);
3674 }
3675 template <>
3677  int8x8_t prod = vmul_s8(a, vrev16_s8(a));
3678  prod = vmul_s8(prod, vrev32_s8(prod));
3679  return vget_lane_s8(prod, 0) * vget_lane_s8(prod, 4);
3680 }
3681 template <>
3683  return predux_mul<Packet8c>(vmul_s8(vget_low_s8(a), vget_high_s8(a)));
3684 }
3685 template <>
3687  uint8x8_t prod = vreinterpret_u8_u32(vdup_n_u32(a));
3688  prod = vmul_u8(prod, vrev16_u8(prod));
3689  return vget_lane_u8(prod, 0) * vget_lane_u8(prod, 2);
3690 }
3691 template <>
3693  uint8x8_t prod = vmul_u8(a, vrev16_u8(a));
3694  prod = vmul_u8(prod, vrev32_u8(prod));
3695  return vget_lane_u8(prod, 0) * vget_lane_u8(prod, 4);
3696 }
3697 template <>
3699  return predux_mul<Packet8uc>(vmul_u8(vget_low_u8(a), vget_high_u8(a)));
3700 }
3701 template <>
3703  const int16x4_t prod = vmul_s16(a, vrev32_s16(a));
3704  return vget_lane_s16(prod, 0) * vget_lane_s16(prod, 2);
3705 }
3706 template <>
3708  int16x4_t prod;
3709 
3710  // Get the product of a_lo * a_hi -> |a1*a5|a2*a6|a3*a7|a4*a8|
3711  prod = vmul_s16(vget_low_s16(a), vget_high_s16(a));
3712  // Swap and multiply |a1*a5*a2*a6|a3*a7*a4*a8|
3713  prod = vmul_s16(prod, vrev32_s16(prod));
3714  // Multiply |a1*a5*a2*a6*a3*a7*a4*a8|
3715  return vget_lane_s16(prod, 0) * vget_lane_s16(prod, 2);
3716 }
3717 template <>
3719  const uint16x4_t prod = vmul_u16(a, vrev32_u16(a));
3720  return vget_lane_u16(prod, 0) * vget_lane_u16(prod, 2);
3721 }
3722 template <>
3724  uint16x4_t prod;
3725 
3726  // Get the product of a_lo * a_hi -> |a1*a5|a2*a6|a3*a7|a4*a8|
3727  prod = vmul_u16(vget_low_u16(a), vget_high_u16(a));
3728  // Swap and multiply |a1*a5*a2*a6|a3*a7*a4*a8|
3729  prod = vmul_u16(prod, vrev32_u16(prod));
3730  // Multiply |a1*a5*a2*a6*a3*a7*a4*a8|
3731  return vget_lane_u16(prod, 0) * vget_lane_u16(prod, 2);
3732 }
3733 template <>
3735  return vget_lane_s32(a, 0) * vget_lane_s32(a, 1);
3736 }
3737 template <>
3739  return predux_mul<Packet2i>(vmul_s32(vget_low_s32(a), vget_high_s32(a)));
3740 }
3741 template <>
3743  return vget_lane_u32(a, 0) * vget_lane_u32(a, 1);
3744 }
3745 template <>
3747  return predux_mul<Packet2ui>(vmul_u32(vget_low_u32(a), vget_high_u32(a)));
3748 }
3749 template <>
3751  return vgetq_lane_s64(a, 0) * vgetq_lane_s64(a, 1);
3752 }
3753 template <>
3755  return vgetq_lane_u64(a, 0) * vgetq_lane_u64(a, 1);
3756 }
3757 
3758 // min
3759 #if EIGEN_ARCH_ARM64
3760 template <>
3762  return vminv_f32(a);
3763 }
3764 template <>
3766  return vminvq_f32(a);
3767 }
3768 #else
3769 template <>
3771  return vget_lane_f32(vpmin_f32(a, a), 0);
3772 }
3773 template <>
3775  const float32x2_t min = vmin_f32(vget_low_f32(a), vget_high_f32(a));
3776  return vget_lane_f32(vpmin_f32(min, min), 0);
3777 }
3778 #endif
3779 template <>
3781  const int8x8_t a_dup = vreinterpret_s8_s32(vdup_n_s32(a));
3782  int8x8_t min = vpmin_s8(a_dup, a_dup);
3783  min = vpmin_s8(min, min);
3784  return vget_lane_s8(min, 0);
3785 }
3786 #if EIGEN_ARCH_ARM64
3787 template <>
3789  return vminv_s8(a);
3790 }
3791 template <>
3793  return vminvq_s8(a);
3794 }
3795 #else
3796 template <>
3798  int8x8_t min = vpmin_s8(a, a);
3799  min = vpmin_s8(min, min);
3800  min = vpmin_s8(min, min);
3801  return vget_lane_s8(min, 0);
3802 }
3803 template <>
3805  int8x8_t min = vmin_s8(vget_low_s8(a), vget_high_s8(a));
3806  min = vpmin_s8(min, min);
3807  min = vpmin_s8(min, min);
3808  min = vpmin_s8(min, min);
3809  return vget_lane_s8(min, 0);
3810 }
3811 #endif
3812 template <>
3814  const uint8x8_t a_dup = vreinterpret_u8_u32(vdup_n_u32(a));
3815  uint8x8_t min = vpmin_u8(a_dup, a_dup);
3816  min = vpmin_u8(min, min);
3817  return vget_lane_u8(min, 0);
3818 }
3819 #if EIGEN_ARCH_ARM64
3820 template <>
3822  return vminv_u8(a);
3823 }
3824 template <>
3826  return vminvq_u8(a);
3827 }
3828 template <>
3830  return vminv_s16(a);
3831 }
3832 template <>
3834  return vminvq_s16(a);
3835 }
3836 template <>
3838  return vminv_u16(a);
3839 }
3840 template <>
3842  return vminvq_u16(a);
3843 }
3844 template <>
3846  return vminv_s32(a);
3847 }
3848 template <>
3850  return vminvq_s32(a);
3851 }
3852 template <>
3854  return vminv_u32(a);
3855 }
3856 template <>
3858  return vminvq_u32(a);
3859 }
3860 #else
3861 template <>
3863  uint8x8_t min = vpmin_u8(a, a);
3864  min = vpmin_u8(min, min);
3865  min = vpmin_u8(min, min);
3866  return vget_lane_u8(min, 0);
3867 }
3868 template <>
3870  uint8x8_t min = vmin_u8(vget_low_u8(a), vget_high_u8(a));
3871  min = vpmin_u8(min, min);
3872  min = vpmin_u8(min, min);
3873  min = vpmin_u8(min, min);
3874  return vget_lane_u8(min, 0);
3875 }
3876 template <>
3878  const int16x4_t min = vpmin_s16(a, a);
3879  return vget_lane_s16(vpmin_s16(min, min), 0);
3880 }
3881 template <>
3883  int16x4_t min = vmin_s16(vget_low_s16(a), vget_high_s16(a));
3884  min = vpmin_s16(min, min);
3885  min = vpmin_s16(min, min);
3886  return vget_lane_s16(min, 0);
3887 }
3888 template <>
3890  const uint16x4_t min = vpmin_u16(a, a);
3891  return vget_lane_u16(vpmin_u16(min, min), 0);
3892 }
3893 template <>
3895  uint16x4_t min = vmin_u16(vget_low_u16(a), vget_high_u16(a));
3896  min = vpmin_u16(min, min);
3897  min = vpmin_u16(min, min);
3898  return vget_lane_u16(min, 0);
3899 }
3900 template <>
3902  return vget_lane_s32(vpmin_s32(a, a), 0);
3903 }
3904 template <>
3906  const int32x2_t min = vmin_s32(vget_low_s32(a), vget_high_s32(a));
3907  return vget_lane_s32(vpmin_s32(min, min), 0);
3908 }
3909 template <>
3911  return vget_lane_u32(vpmin_u32(a, a), 0);
3912 }
3913 template <>
3915  const uint32x2_t min = vmin_u32(vget_low_u32(a), vget_high_u32(a));
3916  return vget_lane_u32(vpmin_u32(min, min), 0);
3917 }
3918 #endif
3919 template <>
3921  return (std::min)(vgetq_lane_s64(a, 0), vgetq_lane_s64(a, 1));
3922 }
3923 template <>
3925  return (std::min)(vgetq_lane_u64(a, 0), vgetq_lane_u64(a, 1));
3926 }
3927 
3928 // max
3929 #if EIGEN_ARCH_ARM64
3930 template <>
3932  return vmaxv_f32(a);
3933 }
3934 template <>
3936  return vmaxvq_f32(a);
3937 }
3938 #else
3939 template <>
3941  return vget_lane_f32(vpmax_f32(a, a), 0);
3942 }
3943 template <>
3945  const float32x2_t max = vmax_f32(vget_low_f32(a), vget_high_f32(a));
3946  return vget_lane_f32(vpmax_f32(max, max), 0);
3947 }
3948 #endif
3949 template <>
3951  const int8x8_t a_dup = vreinterpret_s8_s32(vdup_n_s32(a));
3952  int8x8_t max = vpmax_s8(a_dup, a_dup);
3953  max = vpmax_s8(max, max);
3954  return vget_lane_s8(max, 0);
3955 }
3956 #if EIGEN_ARCH_ARM64
3957 template <>
3959  return vmaxv_s8(a);
3960 }
3961 template <>
3963  return vmaxvq_s8(a);
3964 }
3965 #else
3966 template <>
3968  int8x8_t max = vpmax_s8(a, a);
3969  max = vpmax_s8(max, max);
3970  max = vpmax_s8(max, max);
3971  return vget_lane_s8(max, 0);
3972 }
3973 template <>
3975  int8x8_t max = vmax_s8(vget_low_s8(a), vget_high_s8(a));
3976  max = vpmax_s8(max, max);
3977  max = vpmax_s8(max, max);
3978  max = vpmax_s8(max, max);
3979  return vget_lane_s8(max, 0);
3980 }
3981 #endif
3982 template <>
3984  const uint8x8_t a_dup = vreinterpret_u8_u32(vdup_n_u32(a));
3985  uint8x8_t max = vpmax_u8(a_dup, a_dup);
3986  max = vpmax_u8(max, max);
3987  return vget_lane_u8(max, 0);
3988 }
3989 #if EIGEN_ARCH_ARM64
3990 template <>
3992  return vmaxv_u8(a);
3993 }
3994 template <>
3996  return vmaxvq_u8(a);
3997 }
3998 template <>
4000  return vmaxv_s16(a);
4001 }
4002 template <>
4004  return vmaxvq_s16(a);
4005 }
4006 template <>
4008  return vmaxv_u16(a);
4009 }
4010 template <>
4012  return vmaxvq_u16(a);
4013 }
4014 template <>
4016  return vmaxv_s32(a);
4017 }
4018 template <>
4020  return vmaxvq_s32(a);
4021 }
4022 template <>
4024  return vmaxv_u32(a);
4025 }
4026 template <>
4028  return vmaxvq_u32(a);
4029 }
4030 #else
4031 template <>
4033  uint8x8_t max = vpmax_u8(a, a);
4034  max = vpmax_u8(max, max);
4035  max = vpmax_u8(max, max);
4036  return vget_lane_u8(max, 0);
4037 }
4038 template <>
4040  uint8x8_t max = vmax_u8(vget_low_u8(a), vget_high_u8(a));
4041  max = vpmax_u8(max, max);
4042  max = vpmax_u8(max, max);
4043  max = vpmax_u8(max, max);
4044  return vget_lane_u8(max, 0);
4045 }
4046 template <>
4048  const int16x4_t max = vpmax_s16(a, a);
4049  return vget_lane_s16(vpmax_s16(max, max), 0);
4050 }
4051 template <>
4053  int16x4_t max = vmax_s16(vget_low_s16(a), vget_high_s16(a));
4054  max = vpmax_s16(max, max);
4055  max = vpmax_s16(max, max);
4056  return vget_lane_s16(max, 0);
4057 }
4058 template <>
4060  const uint16x4_t max = vpmax_u16(a, a);
4061  return vget_lane_u16(vpmax_u16(max, max), 0);
4062 }
4063 template <>
4065  uint16x4_t max = vmax_u16(vget_low_u16(a), vget_high_u16(a));
4066  max = vpmax_u16(max, max);
4067  max = vpmax_u16(max, max);
4068  return vget_lane_u16(max, 0);
4069 }
4070 template <>
4072  return vget_lane_s32(vpmax_s32(a, a), 0);
4073 }
4074 template <>
4076  const int32x2_t max = vmax_s32(vget_low_s32(a), vget_high_s32(a));
4077  return vget_lane_s32(vpmax_s32(max, max), 0);
4078 }
4079 template <>
4081  return vget_lane_u32(vpmax_u32(a, a), 0);
4082 }
4083 template <>
4085  const uint32x2_t max = vmax_u32(vget_low_u32(a), vget_high_u32(a));
4086  return vget_lane_u32(vpmax_u32(max, max), 0);
4087 }
4088 #endif
4089 template <>
4091  return (std::max)(vgetq_lane_s64(a, 0), vgetq_lane_s64(a, 1));
4092 }
4093 template <>
4095  return (std::max)(vgetq_lane_u64(a, 0), vgetq_lane_u64(a, 1));
4096 }
4097 
4098 template <>
4100  uint32x2_t tmp = vorr_u32(vget_low_u32(vreinterpretq_u32_f32(x)), vget_high_u32(vreinterpretq_u32_f32(x)));
4101  return vget_lane_u32(vpmax_u32(tmp, tmp), 0);
4102 }
4103 
4104 // Helpers for ptranspose.
4105 namespace detail {
4106 
4107 template <typename Packet>
4109 
4110 template <>
4112  const float32x2x2_t tmp = vzip_f32(p1, p2);
4113  p1 = tmp.val[0];
4114  p2 = tmp.val[1];
4115 }
4116 
4117 template <>
4119  const float32x4x2_t tmp = vzipq_f32(p1, p2);
4120  p1 = tmp.val[0];
4121  p2 = tmp.val[1];
4122 }
4123 
4124 template <>
4126  const int8x8x2_t tmp = vzip_s8(p1, p2);
4127  p1 = tmp.val[0];
4128  p2 = tmp.val[1];
4129 }
4130 
4131 template <>
4133  const int8x16x2_t tmp = vzipq_s8(p1, p2);
4134  p1 = tmp.val[0];
4135  p2 = tmp.val[1];
4136 }
4137 
4138 template <>
4140  const uint8x8x2_t tmp = vzip_u8(p1, p2);
4141  p1 = tmp.val[0];
4142  p2 = tmp.val[1];
4143 }
4144 
4145 template <>
4147  const uint8x16x2_t tmp = vzipq_u8(p1, p2);
4148  p1 = tmp.val[0];
4149  p2 = tmp.val[1];
4150 }
4151 
4152 template <>
4154  const int32x2x2_t tmp = vzip_s32(p1, p2);
4155  p1 = tmp.val[0];
4156  p2 = tmp.val[1];
4157 }
4158 
4159 template <>
4161  const int32x4x2_t tmp = vzipq_s32(p1, p2);
4162  p1 = tmp.val[0];
4163  p2 = tmp.val[1];
4164 }
4165 
4166 template <>
4168  const uint32x2x2_t tmp = vzip_u32(p1, p2);
4169  p1 = tmp.val[0];
4170  p2 = tmp.val[1];
4171 }
4172 
4173 template <>
4175  const uint32x4x2_t tmp = vzipq_u32(p1, p2);
4176  p1 = tmp.val[0];
4177  p2 = tmp.val[1];
4178 }
4179 
4180 template <>
4182  const int16x4x2_t tmp = vzip_s16(p1, p2);
4183  p1 = tmp.val[0];
4184  p2 = tmp.val[1];
4185 }
4186 
4187 template <>
4189  const int16x8x2_t tmp = vzipq_s16(p1, p2);
4190  p1 = tmp.val[0];
4191  p2 = tmp.val[1];
4192 }
4193 
4194 template <>
4196  const uint16x4x2_t tmp = vzip_u16(p1, p2);
4197  p1 = tmp.val[0];
4198  p2 = tmp.val[1];
4199 }
4200 
4201 template <>
4203  const uint16x8x2_t tmp = vzipq_u16(p1, p2);
4204  p1 = tmp.val[0];
4205  p2 = tmp.val[1];
4206 }
4207 
4208 template <typename Packet>
4210  zip_in_place(kernel.packet[0], kernel.packet[1]);
4211 }
4212 
4213 template <typename Packet>
4215  zip_in_place(kernel.packet[0], kernel.packet[2]);
4216  zip_in_place(kernel.packet[1], kernel.packet[3]);
4217  zip_in_place(kernel.packet[0], kernel.packet[1]);
4218  zip_in_place(kernel.packet[2], kernel.packet[3]);
4219 }
4220 
4221 template <typename Packet>
4223  zip_in_place(kernel.packet[0], kernel.packet[4]);
4224  zip_in_place(kernel.packet[1], kernel.packet[5]);
4225  zip_in_place(kernel.packet[2], kernel.packet[6]);
4226  zip_in_place(kernel.packet[3], kernel.packet[7]);
4227 
4228  zip_in_place(kernel.packet[0], kernel.packet[2]);
4229  zip_in_place(kernel.packet[1], kernel.packet[3]);
4230  zip_in_place(kernel.packet[4], kernel.packet[6]);
4231  zip_in_place(kernel.packet[5], kernel.packet[7]);
4232 
4233  zip_in_place(kernel.packet[0], kernel.packet[1]);
4234  zip_in_place(kernel.packet[2], kernel.packet[3]);
4235  zip_in_place(kernel.packet[4], kernel.packet[5]);
4236  zip_in_place(kernel.packet[6], kernel.packet[7]);
4237 }
4238 
4239 template <typename Packet>
4242  for (int i = 0; i < 4; ++i) {
4243  const int m = (1 << i);
4245  for (int j = 0; j < m; ++j) {
4246  const int n = (1 << (3 - i));
4248  for (int k = 0; k < n; ++k) {
4249  const int idx = 2 * j * n + k;
4250  zip_in_place(kernel.packet[idx], kernel.packet[idx + n]);
4251  }
4252  }
4253  }
4254 }
4255 
4256 } // namespace detail
4257 
4259  detail::ptranspose_impl(kernel);
4260 }
4261 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
4262  detail::ptranspose_impl(kernel);
4263 }
4264 
4266  const int8x8_t a = vreinterpret_s8_s32(vset_lane_s32(kernel.packet[2], vdup_n_s32(kernel.packet[0]), 1));
4267  const int8x8_t b = vreinterpret_s8_s32(vset_lane_s32(kernel.packet[3], vdup_n_s32(kernel.packet[1]), 1));
4268 
4269  const int8x8x2_t zip8 = vzip_s8(a, b);
4270  const int16x4x2_t zip16 = vzip_s16(vreinterpret_s16_s8(zip8.val[0]), vreinterpret_s16_s8(zip8.val[1]));
4271 
4272  kernel.packet[0] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[0]), 0);
4273  kernel.packet[1] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[0]), 1);
4274  kernel.packet[2] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[1]), 0);
4275  kernel.packet[3] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[1]), 1);
4276 }
4278  detail::ptranspose_impl(kernel);
4279 }
4281  detail::ptranspose_impl(kernel);
4282 }
4283 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16c, 16>& kernel) {
4284  detail::ptranspose_impl(kernel);
4285 }
4286 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16c, 8>& kernel) {
4287  detail::ptranspose_impl(kernel);
4288 }
4289 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16c, 4>& kernel) {
4290  detail::ptranspose_impl(kernel);
4291 }
4292 
4294  const uint8x8_t a = vreinterpret_u8_u32(vset_lane_u32(kernel.packet[2], vdup_n_u32(kernel.packet[0]), 1));
4295  const uint8x8_t b = vreinterpret_u8_u32(vset_lane_u32(kernel.packet[3], vdup_n_u32(kernel.packet[1]), 1));
4296 
4297  const uint8x8x2_t zip8 = vzip_u8(a, b);
4298  const uint16x4x2_t zip16 = vzip_u16(vreinterpret_u16_u8(zip8.val[0]), vreinterpret_u16_u8(zip8.val[1]));
4299 
4300  kernel.packet[0] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[0]), 0);
4301  kernel.packet[1] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[0]), 1);
4302  kernel.packet[2] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[1]), 0);
4303  kernel.packet[3] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[1]), 1);
4304 }
4306  detail::ptranspose_impl(kernel);
4307 }
4309  detail::ptranspose_impl(kernel);
4310 }
4311 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16uc, 16>& kernel) {
4312  detail::ptranspose_impl(kernel);
4313 }
// ptranspose overload for a block of 8 Packet16uc packets. It only forwards
// `kernel` to detail::ptranspose_impl (defined outside this excerpt).
4314 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16uc, 8>& kernel) {
4315  detail::ptranspose_impl(kernel);
4316 }
// ptranspose overload for a block of 4 Packet16uc packets. It only forwards
// `kernel` to detail::ptranspose_impl (defined outside this excerpt).
4317 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16uc, 4>& kernel) {
4318  detail::ptranspose_impl(kernel);
4319 }
4320 
4322  detail::ptranspose_impl(kernel);
4323 }
// ptranspose overload for a block of 8 Packet8s packets. It only forwards
// `kernel` to detail::ptranspose_impl (defined outside this excerpt).
4324 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8s, 8>& kernel) {
4325  detail::ptranspose_impl(kernel);
4326 }
// ptranspose overload for a block of 4 Packet8s packets. It only forwards
// `kernel` to detail::ptranspose_impl (defined outside this excerpt).
4327 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8s, 4>& kernel) {
4328  detail::ptranspose_impl(kernel);
4329 }
4330 
4332  detail::ptranspose_impl(kernel);
4333 }
// ptranspose overload for a block of 8 Packet8us packets. It only forwards
// `kernel` to detail::ptranspose_impl (defined outside this excerpt).
4334 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8us, 8>& kernel) {
4335  detail::ptranspose_impl(kernel);
4336 }
// ptranspose overload for a block of 4 Packet8us packets. It only forwards
// `kernel` to detail::ptranspose_impl (defined outside this excerpt).
4337 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8us, 4>& kernel) {
4338  detail::ptranspose_impl(kernel);
4339 }
4340 
4342  detail::ptranspose_impl(kernel);
4343 }
// ptranspose overload for a block of 4 Packet4i packets. It only forwards
// `kernel` to detail::ptranspose_impl (defined outside this excerpt).
4344 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4i, 4>& kernel) {
4345  detail::ptranspose_impl(kernel);
4346 }
4348  detail::zip_in_place(kernel.packet[0], kernel.packet[1]);
4349 }
// ptranspose overload for a block of 4 Packet4ui packets. It only forwards
// `kernel` to detail::ptranspose_impl (defined outside this excerpt).
4350 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4ui, 4>& kernel) {
4351  detail::ptranspose_impl(kernel);
4352 }
4353 
// Transposes a 2x2 block held as two Packet2l values; the result is written
// back into kernel.packet[0] and kernel.packet[1].
4354 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2l, 2>& kernel) {
4355 #if EIGEN_ARCH_ARM64
      // A64 path. The first zip result is parked in tmp1 because packet[0] is
      // still an input of the second zip and must not be overwritten yet.
4356  const int64x2_t tmp1 = vzip1q_s64(kernel.packet[0], kernel.packet[1]);
4357  kernel.packet[1] = vzip2q_s64(kernel.packet[0], kernel.packet[1]);
4358  kernel.packet[0] = tmp1;
4359 #else
      // Fallback path: tmp[i][0] / tmp[i][1] are the low / high 64-bit halves
      // of packet i; they are recombined low-with-low and high-with-high.
4360  const int64x1_t tmp[2][2] = {{vget_low_s64(kernel.packet[0]), vget_high_s64(kernel.packet[0])},
4361  {vget_low_s64(kernel.packet[1]), vget_high_s64(kernel.packet[1])}};
4362 
4363  kernel.packet[0] = vcombine_s64(tmp[0][0], tmp[1][0]);
4364  kernel.packet[1] = vcombine_s64(tmp[0][1], tmp[1][1]);
4365 #endif
4366 }
// Transposes a 2x2 block held as two Packet2ul values; the result is written
// back into kernel.packet[0] and kernel.packet[1].
4367 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2ul, 2>& kernel) {
4368 #if EIGEN_ARCH_ARM64
      // A64 path. The first zip result is parked in tmp1 because packet[0] is
      // still an input of the second zip and must not be overwritten yet.
4369  const uint64x2_t tmp1 = vzip1q_u64(kernel.packet[0], kernel.packet[1]);
4370  kernel.packet[1] = vzip2q_u64(kernel.packet[0], kernel.packet[1]);
4371  kernel.packet[0] = tmp1;
4372 #else
      // Fallback path: tmp[i][0] / tmp[i][1] are the low / high 64-bit halves
      // of packet i; they are recombined low-with-low and high-with-high.
4373  const uint64x1_t tmp[2][2] = {{vget_low_u64(kernel.packet[0]), vget_high_u64(kernel.packet[0])},
4374  {vget_low_u64(kernel.packet[1]), vget_high_u64(kernel.packet[1])}};
4375 
4376  kernel.packet[0] = vcombine_u64(tmp[0][0], tmp[1][0]);
4377  kernel.packet[1] = vcombine_u64(tmp[0][1], tmp[1][1]);
4378 #endif
4379 }
4380 
4381 template <>
4383  return vbsl_f32(vreinterpret_u32_f32(mask), a, b);
4384 }
4385 template <>
4387  return vbslq_f32(vreinterpretq_u32_f32(mask), a, b);
4388 }
4389 template <>
4391  return vbsl_s8(vreinterpret_u8_s8(mask), a, b);
4392 }
4393 template <>
4395  return vbslq_s8(vreinterpretq_u8_s8(mask), a, b);
4396 }
4397 template <>
4399  return vbsl_u8(mask, a, b);
4400 }
4401 template <>
4403  const Packet16uc& b) {
4404  return vbslq_u8(mask, a, b);
4405 }
4406 template <>
4408  return vbsl_s16(vreinterpret_u16_s16(mask), a, b);
4409 }
4410 template <>
4412  return vbslq_s16(vreinterpretq_u16_s16(mask), a, b);
4413 }
4414 template <>
4416  return vbsl_u16(mask, a, b);
4417 }
4418 template <>
4420  return vbslq_u16(mask, a, b);
4421 }
4422 template <>
4424  return vbsl_s32(vreinterpret_u32_s32(mask), a, b);
4425 }
4426 template <>
4428  return vbslq_s32(vreinterpretq_u32_s32(mask), a, b);
4429 }
4430 template <>
4432  return vbsl_u32(mask, a, b);
4433 }
4434 template <>
4436  return vbslq_u32(mask, a, b);
4437 }
4438 template <>
4440  return vbslq_s64(vreinterpretq_u64_s64(mask), a, b);
4441 }
4442 template <>
4444  return vbslq_u64(mask, a, b);
4445 }
4446 
4447 // Use armv8 rounding intrinsics if available.
4448 #if EIGEN_ARCH_ARMV8
// Packet rint for Packet2f: a thin wrapper over vrndn_f32 (round to nearest,
// ties to even, per the Arm intrinsics reference).
4449 template <>
4450 EIGEN_STRONG_INLINE Packet2f print<Packet2f>(const Packet2f& a) {
4451  return vrndn_f32(a);
4452 }
4453 
4454 template <>
4456  return vrndnq_f32(a);
4457 }
4458 
// Packet floor for Packet2f: a thin wrapper over vrndm_f32 (round toward
// minus infinity, per the Arm intrinsics reference).
4459 template <>
4460 EIGEN_STRONG_INLINE Packet2f pfloor<Packet2f>(const Packet2f& a) {
4461  return vrndm_f32(a);
4462 }
4463 
4464 template <>
4466  return vrndmq_f32(a);
4467 }
4468 
// Packet ceil for Packet2f: a thin wrapper over vrndp_f32 (round toward
// plus infinity, per the Arm intrinsics reference).
4469 template <>
4470 EIGEN_STRONG_INLINE Packet2f pceil<Packet2f>(const Packet2f& a) {
4471  return vrndp_f32(a);
4472 }
4473 
4474 template <>
4476  return vrndpq_f32(a);
4477 }
4478 
// Packet round for Packet2f: a thin wrapper over vrnda_f32 (round to
// nearest, ties away from zero, per the Arm intrinsics reference).
4479 template <>
4480 EIGEN_STRONG_INLINE Packet2f pround<Packet2f>(const Packet2f& a) {
4481  return vrnda_f32(a);
4482 }
4483 
4484 template <>
4486  return vrndaq_f32(a);
4487 }
4488 
// Packet truncation for Packet2f: a thin wrapper over vrnd_f32 (round
// toward zero, per the Arm intrinsics reference).
4489 template <>
4490 EIGEN_STRONG_INLINE Packet2f ptrunc<Packet2f>(const Packet2f& a) {
4491  return vrnd_f32(a);
4492 }
4493 
4494 template <>
4496  return vrndq_f32(a);
4497 }
4498 #endif
4499 
4506 template <>
4508  uint8x8_t x = vreinterpret_u8_u32(vdup_n_u32(a));
4509  uint8x8_t res = vdup_n_u8(0);
4510  uint8x8_t add = vdup_n_u8(0x8);
4511  for (int i = 0; i < 4; i++) {
4512  const uint8x8_t temp = vorr_u8(res, add);
4513  res = vbsl_u8(vcge_u8(x, vmul_u8(temp, temp)), temp, res);
4514  add = vshr_n_u8(add, 1);
4515  }
4516  return vget_lane_u32(vreinterpret_u32_u8(res), 0);
4517 }
4519 template <>
4521  uint8x8_t res = vdup_n_u8(0);
4522  uint8x8_t add = vdup_n_u8(0x8);
4523  for (int i = 0; i < 4; i++) {
4524  const uint8x8_t temp = vorr_u8(res, add);
4525  res = vbsl_u8(vcge_u8(a, vmul_u8(temp, temp)), temp, res);
4526  add = vshr_n_u8(add, 1);
4527  }
4528  return res;
4529 }
4531 template <>
4533  uint8x16_t res = vdupq_n_u8(0);
4534  uint8x16_t add = vdupq_n_u8(0x8);
4535  for (int i = 0; i < 4; i++) {
4536  const uint8x16_t temp = vorrq_u8(res, add);
4537  res = vbslq_u8(vcgeq_u8(a, vmulq_u8(temp, temp)), temp, res);
4538  add = vshrq_n_u8(add, 1);
4539  }
4540  return res;
4541 }
4543 template <>
4545  uint16x4_t res = vdup_n_u16(0);
4546  uint16x4_t add = vdup_n_u16(0x80);
4547  for (int i = 0; i < 8; i++) {
4548  const uint16x4_t temp = vorr_u16(res, add);
4549  res = vbsl_u16(vcge_u16(a, vmul_u16(temp, temp)), temp, res);
4550  add = vshr_n_u16(add, 1);
4551  }
4552  return res;
4553 }
4555 template <>
4557  uint16x8_t res = vdupq_n_u16(0);
4558  uint16x8_t add = vdupq_n_u16(0x80);
4559  for (int i = 0; i < 8; i++) {
4560  const uint16x8_t temp = vorrq_u16(res, add);
4561  res = vbslq_u16(vcgeq_u16(a, vmulq_u16(temp, temp)), temp, res);
4562  add = vshrq_n_u16(add, 1);
4563  }
4564  return res;
4565 }
4567 template <>
4569  uint32x2_t res = vdup_n_u32(0);
4570  uint32x2_t add = vdup_n_u32(0x8000);
4571  for (int i = 0; i < 16; i++) {
4572  const uint32x2_t temp = vorr_u32(res, add);
4573  res = vbsl_u32(vcge_u32(a, vmul_u32(temp, temp)), temp, res);
4574  add = vshr_n_u32(add, 1);
4575  }
4576  return res;
4577 }
4579 template <>
4581  uint32x4_t res = vdupq_n_u32(0);
4582  uint32x4_t add = vdupq_n_u32(0x8000);
4583  for (int i = 0; i < 16; i++) {
4584  const uint32x4_t temp = vorrq_u32(res, add);
4585  res = vbslq_u32(vcgeq_u32(a, vmulq_u32(temp, temp)), temp, res);
4586  add = vshrq_n_u32(add, 1);
4587  }
4588  return res;
4589 }
4590 
4592  // Compute approximate reciprocal sqrt.
4593  // Does not correctly handle +/- 0 or +inf
4594  float32x4_t result = vrsqrteq_f32(a);
4595  result = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, result), result), result);
4596  result = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, result), result), result);
4597  return result;
4598 }
4599 
4601  // Compute approximate reciprocal sqrt.
4602  // Does not correctly handle +/- 0 or +inf
4603  float32x2_t result = vrsqrte_f32(a);
4604  result = vmul_f32(vrsqrts_f32(vmul_f32(a, result), result), result);
4605  result = vmul_f32(vrsqrts_f32(vmul_f32(a, result), result), result);
4606  return result;
4607 }
4608 
4609 template <typename Packet>
4611  const Packet cst_zero = pzero(a);
4612  const Packet cst_inf = pset1<Packet>(NumTraits<float>::infinity());
4613  Packet return_zero = pcmp_eq(a, cst_inf);
4614  Packet return_inf = pcmp_eq(a, cst_zero);
4615  Packet result = prsqrt_float_unsafe(a);
4616  result = pselect(return_inf, por(cst_inf, a), result);
4617  result = pandnot(result, return_zero);
4618  return result;
4619 }
4620 
4621 template <>
4623  return prsqrt_float_common(a);
4624 }
4625 
4626 template <>
4628  return prsqrt_float_common(a);
4629 }
4630 
4631 template <>
4633  // Compute approximate reciprocal.
4634  float32x4_t result = vrecpeq_f32(a);
4635  result = vmulq_f32(vrecpsq_f32(a, result), result);
4636  result = vmulq_f32(vrecpsq_f32(a, result), result);
4637  return result;
4638 }
4639 
4640 template <>
4642  // Compute approximate reciprocal.
4643  float32x2_t result = vrecpe_f32(a);
4644  result = vmul_f32(vrecps_f32(a, result), result);
4645  result = vmul_f32(vrecps_f32(a, result), result);
4646  return result;
4647 }
4648 
4649 // Unfortunately vsqrt_f32 is only available for A64.
4650 #if EIGEN_ARCH_ARM64
4651 template <>
4653  return vsqrtq_f32(a);
4654 }
4655 
4656 template <>
4658  return vsqrt_f32(a);
4659 }
4660 
4661 template <>
4663  return vdivq_f32(a, b);
4664 }
4665 
4666 template <>
4668  return vdiv_f32(a, b);
4669 }
4670 #else
4671 template <typename Packet>
4673  const Packet cst_zero = pzero(a);
4674  const Packet cst_inf = pset1<Packet>(NumTraits<float>::infinity());
4675 
4676  Packet result = pmul(a, prsqrt_float_unsafe(a));
4677  Packet a_is_zero = pcmp_eq(a, cst_zero);
4678  Packet a_is_inf = pcmp_eq(a, cst_inf);
4679  Packet return_a = por(a_is_zero, a_is_inf);
4680 
4681  result = pselect(return_a, a, result);
4682  return result;
4683 }
4684 
4685 template <>
4687  return psqrt_float_common(a);
4688 }
4689 
4690 template <>
4692  return psqrt_float_common(a);
4693 }
4694 
4695 template <typename Packet>
4697  // if b is large, NEON intrinsics will flush preciprocal(b) to zero
4698  // avoid underflow with the following manipulation:
4699  // a / b = f * (a * reciprocal(f * b))
4700 
4701  const Packet cst_one = pset1<Packet>(1.0f);
4702  const Packet cst_quarter = pset1<Packet>(0.25f);
4703  const Packet cst_thresh = pset1<Packet>(NumTraits<float>::highest() / 4.0f);
4704 
4705  Packet b_will_underflow = pcmp_le(cst_thresh, pabs(b));
4706  Packet f = pselect(b_will_underflow, cst_quarter, cst_one);
4707  Packet result = pmul(f, pmul(a, preciprocal(pmul(b, f))));
4708  return result;
4709 }
4710 
4711 template <>
4713  return pdiv_float_common(a, b);
4714 }
4715 
4716 template <>
4718  return pdiv_float_common(a, b);
4719 }
4720 #endif
4721 
4722 //---------- bfloat16 ----------
4723 // TODO: Add support for native armv8.6-a bfloat16_t
4724 
4725 // TODO: Guard if we have native bfloat16 support
4727 
4728 template <>
4730  enum { value = true };
4731 };
4732 
// Packet traits for bfloat16 on NEON: both the full packet (`type`) and the
// `half` packet are Packet4bf, with 4 lanes. The enum advertises which packet
// operations are available (1) or not (0).
// NOTE(review): the embedded listing numbers skip values inside the enum
// (4745, 4751, 4758-4759, 4763-4764), so some enumerators appear to be
// missing from this excerpt -- check against the real header.
4733 template <>
4734 struct packet_traits<bfloat16> : default_packet_traits {
4735  typedef Packet4bf type;
4736  typedef Packet4bf half;
4737  enum {
4738  Vectorizable = 1,
4739  AlignedOnScalar = 1,
4740  size = 4,
4741 
4742  HasCmp = 1,
4743  HasAdd = 1,
4744  HasSub = 1,
4746  HasMul = 1,
4747  HasNegate = 1,
4748  HasAbs = 1,
4749  HasArg = 0,
4750  HasAbs2 = 1,
4752  HasMin = 1,
4753  HasMax = 1,
4754  HasConj = 1,
4755  HasSetLinear = 1,
4756  HasBlend = 0,
4757  HasDiv = 1,
4760  HasLog = 1,
4761  HasExp = 1,
4762  HasSqrt = 0,
4765  HasBessel = 0, // Issues with accuracy.
4766  HasNdtri = 0
4767  };
4768 };
4769 
4770 template <>
4772  typedef bfloat16 type;
4773  typedef Packet4bf half;
4774  enum {
4775  size = 4,
4779  masked_store_available = false
4780  };
4781 };
4782 
4783 namespace detail {
4784 template <>
4786  const uint16x4x2_t tmp = vzip_u16(p1, p2);
4787  p1 = tmp.val[0];
4788  p2 = tmp.val[1];
4789 }
4790 } // namespace detail
4791 
4793  // See the scalar implementation in BFloat16.h for a comprehensible explanation
4794  // of this fast rounding algorithm
4795  Packet4ui input = Packet4ui(vreinterpretq_u32_f32(p));
4796 
4797  // lsb = (input >> 16) & 1
4798  Packet4ui lsb = vandq_u32(vshrq_n_u32(input, 16), vdupq_n_u32(1));
4799 
4800  // rounding_bias = 0x7fff + lsb
4801  Packet4ui rounding_bias = vaddq_u32(lsb, vdupq_n_u32(0x7fff));
4802 
4803  // input += rounding_bias
4804  input = vaddq_u32(input, rounding_bias);
4805 
4806  // input = input >> 16
4807  input = vshrq_n_u32(input, 16);
4808 
4809  // Replace float-nans by bfloat16-nans, that is 0x7fc0
4810  const Packet4ui bf16_nan = vdupq_n_u32(0x7fc0);
4811  const Packet4ui mask = vceqq_f32(p, p);
4812  input = vbslq_u32(mask, input, bf16_nan);
4813 
4814  // output = static_cast<uint16_t>(input)
4815  return vmovn_u32(input);
4816 }
4817 
4819  return Packet4f(vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(p), 16)));
4820 }
4821 
// Converts a 4-lane float mask into a 4-lane 16-bit mask: the float bits are
// reinterpreted as 32-bit unsigned lanes and narrowed with vmovn_u32 (which,
// per the Arm intrinsics reference, keeps the low 16 bits of each lane).
4822 EIGEN_STRONG_INLINE Packet4bf F32MaskToBf16Mask(const Packet4f& p) { return vmovn_u32(vreinterpretq_u32_f32(p)); }
4823 
4824 template <>
4826  return Packet4bf(pset1<Packet4us>(from.value));
4827 }
4828 
4829 template <>
4832 }
4833 
4834 template <>
4836  return Packet4bf(pload<Packet4us>(reinterpret_cast<const uint16_t*>(from)));
4837 }
4838 
4839 template <>
4841  return Packet4bf(ploadu<Packet4us>(reinterpret_cast<const uint16_t*>(from)));
4842 }
4843 
4844 template <>
4846  EIGEN_DEBUG_ALIGNED_STORE vst1_u16(reinterpret_cast<uint16_t*>(to), from);
4847 }
4848 
4849 template <>
4851  EIGEN_DEBUG_UNALIGNED_STORE vst1_u16(reinterpret_cast<uint16_t*>(to), from);
4852 }
4853 
4854 template <>
4856  return Packet4bf(ploaddup<Packet4us>(reinterpret_cast<const uint16_t*>(from)));
4857 }
4858 
4859 template <>
4861  return F32ToBf16(pabs<Packet4f>(Bf16ToF32(a)));
4862 }
4863 
4864 template <>
4867 }
4868 template <>
4871 }
4872 
4873 template <>
4876 }
4877 
4878 template <>
4881 }
4882 template <>
4885 }
4886 
4887 template <>
4890 }
4891 
4892 template <>
4894  return F32ToBf16(plset<Packet4f>(static_cast<float>(a)));
4895 }
4896 
4897 template <>
4900 }
4901 
4902 template <>
4905 }
4906 
4907 template <>
4910 }
4911 
4912 template <>
4915 }
4916 
4917 template <>
4919  return Packet4bf(pselect<Packet4us>(Packet4us(mask), Packet4us(a), Packet4us(b)));
4920 }
4921 
4922 template <>
4925 }
4926 
4927 template <>
4930 }
4931 
4932 template <>
4935 }
4936 
4937 template <>
4940 }
4941 
4942 template <>
4945 }
4946 
4947 template <>
4949  return a;
4950 }
4951 
4952 template <>
4955 }
4956 
4957 template <>
4960 }
4961 
4962 template <>
4965 }
4966 
4967 template <>
4970 }
4971 
4972 template <>
4974  return Packet4bf(pgather<uint16_t, Packet4us>(reinterpret_cast<const uint16_t*>(from), stride));
4975 }
4976 
4977 template <>
4979  pscatter<uint16_t, Packet4us>(reinterpret_cast<uint16_t*>(to), Packet4us(from), stride);
4980 }
4981 
4982 template <>
4984  return static_cast<bfloat16>(predux<Packet4f>(Bf16ToF32(a)));
4985 }
4986 
4987 template <>
4989  return static_cast<bfloat16>(predux_max<Packet4f>(Bf16ToF32(a)));
4990 }
4991 
4992 template <>
4994  return static_cast<bfloat16>(predux_min<Packet4f>(Bf16ToF32(a)));
4995 }
4996 
4997 template <>
4999  return static_cast<bfloat16>(predux_mul<Packet4f>(Bf16ToF32(a)));
5000 }
5001 
5002 template <>
5004  return Packet4bf(preverse<Packet4us>(Packet4us(a)));
5005 }
5006 
5008  detail::ptranspose_impl(kernel);
5009 }
5010 
5011 template <>
5014 }
5015 
5016 template <>
5019 }
5020 
5021 template <>
5024 }
5025 
5026 template <>
5029 }
5030 
5031 template <>
5034 }
5035 
5036 template <>
5038  return Packet4bf(pxor<Packet4us>(Packet4us(a), pset1<Packet4us>(static_cast<uint16_t>(0x8000))));
5039 }
5040 
5041 //---------- double ----------
5042 
5043 // Clang 3.5 in the iOS toolchain has an ICE triggered by NEON intrinsics for double.
5044 // Confirmed at least with __apple_build_version__ = 6000054.
5045 #if EIGEN_COMP_CLANGAPPLE
5046 // Let's hope that by the time __apple_build_version__ hits the 601* range, the bug will be fixed.
5047 // https://gist.github.com/yamaya/2924292 suggests that the 3 first digits are only updated with
5048 // major toolchain updates.
5049 #define EIGEN_APPLE_DOUBLE_NEON_BUG (EIGEN_COMP_CLANGAPPLE < 6010000)
5050 #else
5051 #define EIGEN_APPLE_DOUBLE_NEON_BUG 0
5052 #endif
5053 
5054 #if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
5055 
5056 #if EIGEN_COMP_GNUC
5057 // Bug 907: workaround missing declarations of the following two functions in the ADK
5058 // Defining these functions as templates ensures that if these intrinsics are
5059 // already defined in arm_neon.h, then our workaround doesn't cause a conflict
5060 // and has lower priority in overload resolution.
5061 // This doesn't work with MSVC though, since the function names are macros.
// Fallback definition: converts `a` to uint64x2_t with a C-style vector cast.
// T is deliberately left generic (see the "Bug 907" comment on the preceding
// lines).
5062 template <typename T>
5063 uint64x2_t vreinterpretq_u64_f64(T a) {
5064  return (uint64x2_t)a;
5065 }
5066 
// Fallback definition: converts `a` to float64x2_t with a C-style vector
// cast. Templated on T for the same reason as vreinterpretq_u64_f64.
5067 template <typename T>
5068 float64x2_t vreinterpretq_f64_u64(T a) {
5069  return (float64x2_t)a;
5070 }
5071 #endif
5072 
5073 #if EIGEN_COMP_MSVC_STRICT
5074 typedef eigen_packet_wrapper<float64x2_t, 18> Packet2d;
5075 typedef eigen_packet_wrapper<float64x1_t, 19> Packet1d;
5076 
// MSVC variant: builds a Packet2d holding {a, b} by writing both scalars to
// a local array and loading that array with vld1q_f64.
5077 EIGEN_ALWAYS_INLINE Packet2d make_packet2d(double a, double b) {
5078  double from[2] = {a, b};
5079  return vld1q_f64(from);
5080 }
5081 
5082 #else
5083 typedef float64x2_t Packet2d;
5084 typedef float64x1_t Packet1d;
5085 
// Non-MSVC variant: builds a Packet2d holding {a, b} by brace-initializing
// the native float64x2_t vector type.
5086 EIGEN_ALWAYS_INLINE Packet2d make_packet2d(double a, double b) { return Packet2d{a, b}; }
5087 #endif
5088 
5089 // functionally equivalent to _mm_shuffle_pd in SSE (i.e. shuffle(m, n, mask) equals _mm_shuffle_pd(m,n,mask))
5090 // Currently used in LU/arch/InverseSize4.h to enable a shared implementation
5091 // for fast inversion of matrices of size 4.
// Builds a Packet2d whose lane 0 is m[mask & 1] and whose lane 1 is
// n[(mask >> 1) & 1]; only the two lowest bits of `mask` are used.
5092 EIGEN_STRONG_INLINE Packet2d shuffle(const Packet2d& m, const Packet2d& n, int mask) {
      // View each packet as an array of two doubles so lanes can be indexed
      // with a run-time value.
5093  const double* a = reinterpret_cast<const double*>(&m);
5094  const double* b = reinterpret_cast<const double*>(&n);
5095  Packet2d res = make_packet2d(*(a + (mask & 1)), *(b + ((mask >> 1) & 1)));
5096  return res;
5097 }
5098 
// Alias of shuffle(a, b, mask); the arguments are forwarded unchanged.
5099 EIGEN_STRONG_INLINE Packet2d vec2d_swizzle2(const Packet2d& a, const Packet2d& b, int mask) {
5100  return shuffle(a, b, mask);
5101 }
// shuffle with mask 0: the result is {a[0], b[0]}.
5102 EIGEN_STRONG_INLINE Packet2d vec2d_unpacklo(const Packet2d& a, const Packet2d& b) { return shuffle(a, b, 0); }
// shuffle with mask 3 (both selector bits set): the result is {a[1], b[1]}.
5103 EIGEN_STRONG_INLINE Packet2d vec2d_unpackhi(const Packet2d& a, const Packet2d& b) { return shuffle(a, b, 3); }
5104 #define vec2d_duplane(a, p) Packet2d(vdupq_laneq_f64(a, p))
5105 
// Packet traits for double on NEON: both the full packet (`type`) and the
// `half` packet are Packet2d, with 2 lanes. The enum advertises which packet
// operations are available (1) or not (0).
// NOTE(review): the embedded listing numbers skip values inside the enum
// (5139-5140, 5143-5145), so some enumerators appear to be missing from this
// excerpt -- check against the real header.
5106 template <>
5107 struct packet_traits<double> : default_packet_traits {
5108  typedef Packet2d type;
5109  typedef Packet2d half;
5110  enum {
5111  Vectorizable = 1,
5112  AlignedOnScalar = 1,
5113  size = 2,
5114 
5115  HasCmp = 1,
5116  HasAdd = 1,
5117  HasSub = 1,
5118  HasShift = 1,
5119  HasMul = 1,
5120  HasNegate = 1,
5121  HasAbs = 1,
5122  HasArg = 0,
5123  HasAbs2 = 1,
5124  HasAbsDiff = 1,
5125  HasMin = 1,
5126  HasMax = 1,
5127  HasConj = 1,
5128  HasSetLinear = 1,
5129  HasBlend = 0,
5130 
5131  HasDiv = 1,
5132 
      // This condition is identical to the one guarding the whole double
      // section this struct sits in, so it holds whenever the struct is
      // compiled at all.
5133 #if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
5134  HasExp = 1,
5135  HasLog = 1,
5136  HasATan = 1,
5137  HasATanh = 1,
5138 #endif
5141  HasSqrt = 1,
5142  HasRsqrt = 1,
5146  };
5147 };
5148 
// Reverse mapping for Packet2d: scalar type double, `half` is Packet2d
// itself, and the integer packet of matching layout is Packet2l. The packet
// has 2 lanes, 16-byte alignment, and no masked load/store support.
5149 template <>
5150 struct unpacket_traits<Packet2d> {
5151  typedef double type;
5152  typedef Packet2d half;
5153  typedef Packet2l integer_packet;
5154  enum {
5155  size = 2,
5156  alignment = Aligned16,
5157  vectorizable = true,
5158  masked_load_available = false,
5159  masked_store_available = false
5160  };
5161 };
5162 
// Returns a Packet2d with both lanes set to 0.0. The parameter is unnamed
// and never read.
5163 template <>
5164 EIGEN_STRONG_INLINE Packet2d pzero<Packet2d>(const Packet2d& /*a*/) {
5165  return vdupq_n_f64(0.0);
5166 }
5167 
// Broadcasts the scalar `from` into both lanes of a Packet2d.
5168 template <>
5169 EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
5170  return vdupq_n_f64(from);
5171 }
5172 
5173 template <>
5175  const double c[] = {0.0, 1.0};
5176  return vaddq_f64(pset1<Packet2d>(a), vld1q_f64(c));
5177 }
5178 
5179 template <>
5181  return vaddq_f64(a, b);
5182 }
5183 
5184 template <>
5186  return vsubq_f64(a, b);
5187 }
5188 
5189 template <>
5191 template <>
5193  const Packet2d mask = make_packet2d(numext::bit_cast<double>(0x8000000000000000ull), 0.0);
5194  return padd(a, pxor(mask, b));
5195 }
5196 
5197 template <>
5199  return vnegq_f64(a);
5200 }
5201 
5202 template <>
5204  return a;
5205 }
5206 
5207 template <>
5209  return vmulq_f64(a, b);
5210 }
5211 
5212 template <>
5214  return vdivq_f64(a, b);
5215 }
5216 
5217 #ifdef EIGEN_VECTORIZE_FMA
5218 // See bug 936. See above comment about FMA for float.
// FMA build: computes a * b + c per lane with the fused vfmaq_f64 intrinsic.
// Note the argument order: the accumulator `c` comes first.
5219 template <>
5220 EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
5221  return vfmaq_f64(c, a, b);
5222 }
5223 #else
// Non-FMA build: computes a * b + c per lane with the multiply-accumulate
// intrinsic vmlaq_f64. Note the argument order: the accumulator `c` comes
// first.
5224 template <>
5225 EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
5226  return vmlaq_f64(c, a, b);
5227 }
5228 #endif
5229 
5230 template <>
5232  return vminq_f64(a, b);
5233 }
5234 
5235 #ifdef __ARM_FEATURE_NUMERIC_MAXMIN
5236 // numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8
5237 // systems).
5238 template <>
5240  return vminnmq_f64(a, b);
5241 }
5242 template <>
5244  return vmaxnmq_f64(a, b);
5245 }
5246 
5247 #endif
5248 
5249 template <>
5251  return pmin<Packet2d>(a, b);
5252 }
5253 
5254 template <>
5256  return vmaxq_f64(a, b);
5257 }
5258 
5259 template <>
5261  return pmax<Packet2d>(a, b);
5262 }
5263 
5264 // Logical operations are not supported on floating-point vectors, so the operands are reinterpreted as integer vectors using NEON intrinsics
5265 template <>
5267  return vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a), vreinterpretq_u64_f64(b)));
5268 }
5269 
5270 template <>
5272  return vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a), vreinterpretq_u64_f64(b)));
5273 }
5274 
5275 template <>
5277  return vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a), vreinterpretq_u64_f64(b)));
5278 }
5279 
5280 template <>
5282  return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a), vreinterpretq_u64_f64(b)));
5283 }
5284 
5285 template <>
5287  return vreinterpretq_f64_u64(vcleq_f64(a, b));
5288 }
5289 
5290 template <>
5292  return vreinterpretq_f64_u64(vcltq_f64(a, b));
5293 }
5294 
5295 template <>
5297  return vreinterpretq_f64_u32(vmvnq_u32(vreinterpretq_u32_u64(vcgeq_f64(a, b))));
5298 }
5299 
5300 template <>
5302  return vreinterpretq_f64_u64(vceqq_f64(a, b));
5303 }
5304 
// Aligned load of two doubles from `from`. The code is the same vld1q_f64
// call as ploadu; only the EIGEN_DEBUG_ALIGNED_LOAD marker differs.
5305 template <>
5306 EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) {
5307  EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f64(from);
5308 }
5309 
// Unaligned load of two doubles from `from`. The code is the same vld1q_f64
// call as pload; only the EIGEN_DEBUG_UNALIGNED_LOAD marker differs.
5310 template <>
5311 EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) {
5312  EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f64(from);
5313 }
5314 
// Loads a single double from `from` and duplicates it into both lanes
// (vld1q_dup_f64).
5315 template <>
5316 EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) {
5317  return vld1q_dup_f64(from);
5318 }
// Aligned store of both lanes of `from` to `to`. The code is the same
// vst1q_f64 call as pstoreu; only the EIGEN_DEBUG_ALIGNED_STORE marker
// differs.
5319 template <>
5320 EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) {
5321  EIGEN_DEBUG_ALIGNED_STORE vst1q_f64(to, from);
5322 }
5323 
// Unaligned store of both lanes of `from` to `to`. The code is the same
// vst1q_f64 call as pstore; only the EIGEN_DEBUG_UNALIGNED_STORE marker
// differs.
5324 template <>
5325 EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) {
5326  EIGEN_DEBUG_UNALIGNED_STORE vst1q_f64(to, from);
5327 }
5328 
5329 template <>
5331  Packet2d res = pset1<Packet2d>(0.0);
5332  res = vld1q_lane_f64(from + 0 * stride, res, 0);
5333  res = vld1q_lane_f64(from + 1 * stride, res, 1);
5334  return res;
5335 }
5336 
// Strided store: lane 0 of `from` goes to to[0] and lane 1 to to[stride].
5337 template <>
5338 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride) {
5339  vst1q_lane_f64(to + stride * 0, from, 0);
5340  vst1q_lane_f64(to + stride * 1, from, 1);
5341 }
5342 
// Prefetch hint for `addr`; forwards to the EIGEN_ARM_PREFETCH macro
// (defined outside this excerpt).
5343 template <>
5344 EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) {
5345  EIGEN_ARM_PREFETCH(addr);
5346 }
5347 
5348 // FIXME only store the 2 first elements ?
5349 template <>
5351  return vgetq_lane_f64(a, 0);
5352 }
5353 
5354 template <>
5356  return vcombine_f64(vget_high_f64(a), vget_low_f64(a));
5357 }
5358 
5359 template <>
5361  return vabsq_f64(a);
5362 }
5363 
5364 template <>
5366  return vreinterpretq_f64_s64(vshrq_n_s64(vreinterpretq_s64_f64(a), 63));
5367 }
5368 
5369 template <>
5371  return vaddvq_f64(a);
5372 }
5373 
5374 // Other reduction functions:
5375 // mul
5376 #if EIGEN_COMP_CLANGAPPLE
5377 template <>
5379  return (vget_low_f64(a) * vget_high_f64(a))[0];
5380 }
5381 #else
5382 template <>
5384  return vget_lane_f64(vmul_f64(vget_low_f64(a), vget_high_f64(a)), 0);
5385 }
5386 #endif
5387 
5388 // min
5389 template <>
5391  return vminvq_f64(a);
5392 }
5393 
5394 // max
5395 template <>
5397  return vmaxvq_f64(a);
5398 }
5399 
// Transposes a 2x2 block of doubles held as two Packet2d values; the result
// is written back into kernel.packet[0] and kernel.packet[1].
5400 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2d, 2>& kernel) {
      // Both zips are computed into temporaries before either packet is
      // overwritten, since each zip reads both inputs.
5401  const float64x2_t tmp1 = vzip1q_f64(kernel.packet[0], kernel.packet[1]);
5402  const float64x2_t tmp2 = vzip2q_f64(kernel.packet[0], kernel.packet[1]);
5403 
5404  kernel.packet[0] = tmp1;
5405  kernel.packet[1] = tmp2;
5406 }
5407 
5408 template <>
5410  return vbslq_f64(vreinterpretq_u64_f64(mask), a, b);
5411 }
5412 
5413 template <>
5415  return vrndnq_f64(a);
5416 }
5417 
5418 template <>
5420  return vrndmq_f64(a);
5421 }
5422 
5423 template <>
5425  return vrndpq_f64(a);
5426 }
5427 
5428 template <>
5430  return vrndaq_f64(a);
5431 }
5432 
5433 template <>
5435  return vrndq_f64(a);
5436 }
5437 
5438 template <>
5440  return pldexp_generic(a, exponent);
5441 }
5442 
5443 template <>
5445  return pfrexp_generic(a, exponent);
5446 }
5447 
5448 template <>
5450  return vreinterpretq_f64_u64(vdupq_n_u64(from));
5451 }
5452 
5453 template <>
5455  // Do Newton iterations for 1/sqrt(x).
5456  return generic_rsqrt_newton_step<Packet2d, /*Steps=*/3>::run(a, vrsqrteq_f64(a));
5457 }
5458 
5459 template <>
5461  return vsqrtq_f64(_x);
5462 }
5463 
5464 #endif // EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
5465 
5466 // Do we have fp16 types and the supporting Neon intrinsics?
5467 #if EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
5468 typedef float16x4_t Packet4hf;
5469 typedef float16x8_t Packet8hf;
5470 
// Packet traits for Eigen::half when native fp16 vector arithmetic is
// available: the full packet is Packet8hf (8 lanes) and the half packet is
// Packet4hf. The enum advertises which packet operations are available (1)
// or not (0).
// NOTE(review): the embedded listing numbers skip 5506 inside the enum, so
// one enumerator appears to be missing from this excerpt -- check against
// the real header.
5471 template <>
5472 struct packet_traits<Eigen::half> : default_packet_traits {
5473  typedef Packet8hf type;
5474  typedef Packet4hf half;
5475  enum {
5476  Vectorizable = 1,
5477  AlignedOnScalar = 1,
5478  size = 8,
5479 
5480  HasCmp = 1,
5481  HasCast = 1,
5482  HasAdd = 1,
5483  HasSub = 1,
5484  HasShift = 1,
5485  HasMul = 1,
5486  HasNegate = 1,
5487  HasAbs = 1,
5488  HasArg = 0,
5489  HasAbs2 = 1,
5490  HasAbsDiff = 0,
5491  HasMin = 1,
5492  HasMax = 1,
5493  HasConj = 1,
5494  HasSetLinear = 1,
5495  HasBlend = 0,
5496  HasInsert = 1,
5497  HasReduxp = 1,
5498  HasDiv = 1,
5499  HasSin = 0,
5500  HasCos = 0,
5501  HasLog = 0,
5502  HasExp = 0,
5503  HasTanh = packet_traits<float>::HasTanh, // tanh<half> calls tanh<float>
5504  HasSqrt = 1,
5505  HasRsqrt = 1,
5507  HasBessel = 0, // Issues with accuracy.
5508  HasNdtri = 0
5509  };
5510 };
5511 
// Reverse mapping for Packet4hf: scalar type Eigen::half, `half` is
// Packet4hf itself, 4 lanes, no masked load/store support.
// NOTE(review): Packet4hf is float16x4_t (4 x 16 bits) yet the alignment is
// declared as Aligned16 -- confirm this is intended.
5512 template <>
5513 struct unpacket_traits<Packet4hf> {
5514  typedef Eigen::half type;
5515  typedef Packet4hf half;
5516  enum {
5517  size = 4,
5518  alignment = Aligned16,
5519  vectorizable = true,
5520  masked_load_available = false,
5521  masked_store_available = false
5522  };
5523 };
5524 
// Reverse mapping for Packet8hf: scalar type Eigen::half, half packet
// Packet4hf, 8 lanes, 16-byte alignment, no masked load/store support.
5525 template <>
5526 struct unpacket_traits<Packet8hf> {
5527  typedef Eigen::half type;
5528  typedef Packet4hf half;
5529  enum {
5530  size = 8,
5531  alignment = Aligned16,
5532  vectorizable = true,
5533  masked_load_available = false,
5534  masked_store_available = false
5535  };
5536 };
5537 
// Reduces 8 lanes to 4 by adding the low and high halves of `a` lane-wise.
5538 template <>
5539 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf predux_half_dowto4<Packet8hf>(const Packet8hf& a) {
5540  return vadd_f16(vget_low_f16(a), vget_high_f16(a));
5541 }
5542 
// Broadcasts the half value into all 8 lanes; `from.x` is the member of
// Eigen::half that is handed to the intrinsic.
5543 template <>
5544 EIGEN_STRONG_INLINE Packet8hf pset1<Packet8hf>(const Eigen::half& from) {
5545  return vdupq_n_f16(from.x);
5546 }
5547 
// Broadcasts the half value into all 4 lanes; `from.x` is the member of
// Eigen::half that is handed to the intrinsic.
5548 template <>
5549 EIGEN_STRONG_INLINE Packet4hf pset1<Packet4hf>(const Eigen::half& from) {
5550  return vdup_n_f16(from.x);
5551 }
5552 
// Returns {a+0, a+1, ..., a+7}: the broadcast of `a` plus per-lane offsets.
5553 template <>
5554 EIGEN_STRONG_INLINE Packet8hf plset<Packet8hf>(const Eigen::half& a) {
5555  const float16_t f[] = {0, 1, 2, 3, 4, 5, 6, 7};
      // Despite its name, `countdown` holds the ascending offsets 0..7.
5556  Packet8hf countdown = vld1q_f16(f);
5557  return vaddq_f16(pset1<Packet8hf>(a), countdown);
5558 }
5559 
// Returns {a+0, a+1, a+2, a+3}: the broadcast of `a` plus per-lane offsets.
5560 template <>
5561 EIGEN_STRONG_INLINE Packet4hf plset<Packet4hf>(const Eigen::half& a) {
5562  const float16_t f[] = {0, 1, 2, 3};
      // Despite its name, `countdown` holds the ascending offsets 0..3.
5563  Packet4hf countdown = vld1_f16(f);
5564  return vadd_f16(pset1<Packet4hf>(a), countdown);
5565 }
5566 
// Lane-wise a + b for 8 half-precision lanes (vaddq_f16).
5567 template <>
5568 EIGEN_STRONG_INLINE Packet8hf padd<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
5569  return vaddq_f16(a, b);
5570 }
5571 
// Lane-wise a + b for 4 half-precision lanes (vadd_f16).
5572 template <>
5573 EIGEN_STRONG_INLINE Packet4hf padd<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
5574  return vadd_f16(a, b);
5575 }
5576 
// Lane-wise a - b for 8 half-precision lanes (vsubq_f16).
5577 template <>
5578 EIGEN_STRONG_INLINE Packet8hf psub<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
5579  return vsubq_f16(a, b);
5580 }
5581 
// Lane-wise a - b for 4 half-precision lanes (vsub_f16).
5582 template <>
5583 EIGEN_STRONG_INLINE Packet4hf psub<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
5584  return vsub_f16(a, b);
5585 }
5586 
// Lane-wise negation of 8 half-precision lanes (vnegq_f16).
5587 template <>
5588 EIGEN_STRONG_INLINE Packet8hf pnegate(const Packet8hf& a) {
5589  return vnegq_f16(a);
5590 }
5591 
// Lane-wise negation of 4 half-precision lanes (vneg_f16).
5592 template <>
5593 EIGEN_STRONG_INLINE Packet4hf pnegate(const Packet4hf& a) {
5594  return vneg_f16(a);
5595 }
5596 
// Conjugate of a real-valued packet: returns `a` unchanged.
5597 template <>
5598 EIGEN_STRONG_INLINE Packet8hf pconj(const Packet8hf& a) {
5599  return a;
5600 }
5601 
// Conjugate of a real-valued packet: returns `a` unchanged.
5602 template <>
5603 EIGEN_STRONG_INLINE Packet4hf pconj(const Packet4hf& a) {
5604  return a;
5605 }
5606 
// Lane-wise a * b for 8 half-precision lanes (vmulq_f16).
5607 template <>
5608 EIGEN_STRONG_INLINE Packet8hf pmul<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
5609  return vmulq_f16(a, b);
5610 }
5611 
// Lane-wise a * b for 4 half-precision lanes (vmul_f16).
5612 template <>
5613 EIGEN_STRONG_INLINE Packet4hf pmul<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
5614  return vmul_f16(a, b);
5615 }
5616 
// Lane-wise a / b for 8 half-precision lanes (vdivq_f16).
5617 template <>
5618 EIGEN_STRONG_INLINE Packet8hf pdiv<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
5619  return vdivq_f16(a, b);
5620 }
5621 
// Lane-wise a / b for 4 half-precision lanes (vdiv_f16).
5622 template <>
5623 EIGEN_STRONG_INLINE Packet4hf pdiv<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
5624  return vdiv_f16(a, b);
5625 }
5626 
// Fused multiply-add a * b + c on 8 half-precision lanes. Note the
// intrinsic's argument order: the accumulator `c` comes first.
5627 template <>
5628 EIGEN_STRONG_INLINE Packet8hf pmadd(const Packet8hf& a, const Packet8hf& b, const Packet8hf& c) {
5629  return vfmaq_f16(c, a, b);
5630 }
5631 
// Fused multiply-add a * b + c on 4 half-precision lanes. Note the
// intrinsic's argument order: the accumulator `c` comes first.
5632 template <>
5633 EIGEN_STRONG_INLINE Packet4hf pmadd(const Packet4hf& a, const Packet4hf& b, const Packet4hf& c) {
5634  return vfma_f16(c, a, b);
5635 }
5636 
// Lane-wise minimum of 8 half-precision lanes (vminq_f16). The PropagateNaN
// overload of pmin forwards to this function.
5637 template <>
5638 EIGEN_STRONG_INLINE Packet8hf pmin<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
5639  return vminq_f16(a, b);
5640 }
5641 
// Lane-wise minimum of 4 half-precision lanes (vmin_f16). The PropagateNaN
// overload of pmin forwards to this function.
5642 template <>
5643 EIGEN_STRONG_INLINE Packet4hf pmin<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
5644  return vmin_f16(a, b);
5645 }
5646 
5647 #ifdef __ARM_FEATURE_NUMERIC_MAXMIN
5648 // numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8
5649 // systems).
// PropagateNumbers minimum on 4 lanes, implemented with the "numeric
// minimum" intrinsic vminnm_f16. Compiled only when
// __ARM_FEATURE_NUMERIC_MAXMIN is defined.
5650 template <>
5651 EIGEN_STRONG_INLINE Packet4hf pmin<PropagateNumbers, Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
5652  return vminnm_f16(a, b);
5653 }
// PropagateNumbers minimum on 8 lanes, implemented with the "numeric
// minimum" intrinsic vminnmq_f16. Compiled only when
// __ARM_FEATURE_NUMERIC_MAXMIN is defined.
5654 template <>
5655 EIGEN_STRONG_INLINE Packet8hf pmin<PropagateNumbers, Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
5656  return vminnmq_f16(a, b);
5657 }
5658 #endif
5659 
// PropagateNaN minimum on 4 lanes: forwards to the plain pmin<Packet4hf>.
5660 template <>
5661 EIGEN_STRONG_INLINE Packet4hf pmin<PropagateNaN, Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
5662  return pmin<Packet4hf>(a, b);
5663 }
5664 
// PropagateNaN minimum on 8 lanes: forwards to the plain pmin<Packet8hf>.
5665 template <>
5666 EIGEN_STRONG_INLINE Packet8hf pmin<PropagateNaN, Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
5667  return pmin<Packet8hf>(a, b);
5668 }
5669 
// Lane-wise maximum of 8 half-precision lanes (vmaxq_f16). The PropagateNaN
// overload of pmax forwards to this function.
5670 template <>
5671 EIGEN_STRONG_INLINE Packet8hf pmax<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
5672  return vmaxq_f16(a, b);
5673 }
5674 
// Lane-wise maximum of 4 half-precision lanes (vmax_f16). The PropagateNaN
// overload of pmax forwards to this function.
5675 template <>
5676 EIGEN_STRONG_INLINE Packet4hf pmax<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
5677  return vmax_f16(a, b);
5678 }
5679 
#ifdef __ARM_FEATURE_NUMERIC_MAXMIN
// The "numeric" max intrinsics (vmaxnm*) are only provided when the compiler
// defines __ARM_FEATURE_NUMERIC_MAXMIN (which can only be the case for Armv8
// systems), so these specializations are compiled conditionally.

// PropagateNumbers maximum over four half lanes.
template <>
EIGEN_STRONG_INLINE Packet4hf pmax<PropagateNumbers, Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
  const Packet4hf numeric_max = vmaxnm_f16(a, b);
  return numeric_max;
}

// PropagateNumbers maximum over eight half lanes.
template <>
EIGEN_STRONG_INLINE Packet8hf pmax<PropagateNumbers, Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
  const Packet8hf numeric_max = vmaxnmq_f16(a, b);
  return numeric_max;
}
#endif
5692 
5693 template <>
5694 EIGEN_STRONG_INLINE Packet4hf pmax<PropagateNaN, Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
5695  return pmax<Packet4hf>(a, b);
5696 }
5697 
5698 template <>
5699 EIGEN_STRONG_INLINE Packet8hf pmax<PropagateNaN, Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
5700  return pmax<Packet8hf>(a, b);
5701 }
5702 
5703 #define EIGEN_MAKE_ARM_FP16_CMP_8(name) \
5704  template <> \
5705  EIGEN_STRONG_INLINE Packet8hf pcmp_##name(const Packet8hf& a, const Packet8hf& b) { \
5706  return vreinterpretq_f16_u16(vc##name##q_f16(a, b)); \
5707  }
5708 
5709 #define EIGEN_MAKE_ARM_FP16_CMP_4(name) \
5710  template <> \
5711  EIGEN_STRONG_INLINE Packet4hf pcmp_##name(const Packet4hf& a, const Packet4hf& b) { \
5712  return vreinterpret_f16_u16(vc##name##_f16(a, b)); \
5713  }
5714 
5715 EIGEN_MAKE_ARM_FP16_CMP_8(eq)
5716 EIGEN_MAKE_ARM_FP16_CMP_8(lt)
5717 EIGEN_MAKE_ARM_FP16_CMP_8(le)
5718 
5719 EIGEN_MAKE_ARM_FP16_CMP_4(eq)
5720 EIGEN_MAKE_ARM_FP16_CMP_4(lt)
5721 EIGEN_MAKE_ARM_FP16_CMP_4(le)
5722 
5723 #undef EIGEN_MAKE_ARM_FP16_CMP_8
5724 #undef EIGEN_MAKE_ARM_FP16_CMP_4
5725 
5726 template <>
5727 EIGEN_STRONG_INLINE Packet8hf pcmp_lt_or_nan<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
5728  return vreinterpretq_f16_u16(vmvnq_u16(vcgeq_f16(a, b)));
5729 }
5730 
5731 template <>
5732 EIGEN_STRONG_INLINE Packet4hf pcmp_lt_or_nan<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
5733  return vreinterpret_f16_u16(vmvn_u16(vcge_f16(a, b)));
5734 }
5735 
5736 template <>
5737 EIGEN_STRONG_INLINE Packet8hf print<Packet8hf>(const Packet8hf& a) {
5738  return vrndnq_f16(a);
5739 }
5740 
5741 template <>
5742 EIGEN_STRONG_INLINE Packet4hf print<Packet4hf>(const Packet4hf& a) {
5743  return vrndn_f16(a);
5744 }
5745 
5746 template <>
5747 EIGEN_STRONG_INLINE Packet8hf pfloor<Packet8hf>(const Packet8hf& a) {
5748  return vrndmq_f16(a);
5749 }
5750 
5751 template <>
5752 EIGEN_STRONG_INLINE Packet4hf pfloor<Packet4hf>(const Packet4hf& a) {
5753  return vrndm_f16(a);
5754 }
5755 
5756 template <>
5757 EIGEN_STRONG_INLINE Packet8hf pceil<Packet8hf>(const Packet8hf& a) {
5758  return vrndpq_f16(a);
5759 }
5760 
5761 template <>
5762 EIGEN_STRONG_INLINE Packet4hf pceil<Packet4hf>(const Packet4hf& a) {
5763  return vrndp_f16(a);
5764 }
5765 
5766 template <>
5767 EIGEN_STRONG_INLINE Packet8hf pround<Packet8hf>(const Packet8hf& a) {
5768  return vrndaq_f16(a);
5769 }
5770 
5771 template <>
5772 EIGEN_STRONG_INLINE Packet4hf pround<Packet4hf>(const Packet4hf& a) {
5773  return vrnda_f16(a);
5774 }
5775 
5776 template <>
5777 EIGEN_STRONG_INLINE Packet8hf ptrunc<Packet8hf>(const Packet8hf& a) {
5778  return vrndq_f16(a);
5779 }
5780 
5781 template <>
5782 EIGEN_STRONG_INLINE Packet4hf ptrunc<Packet4hf>(const Packet4hf& a) {
5783  return vrnd_f16(a);
5784 }
5785 
5786 template <>
5787 EIGEN_STRONG_INLINE Packet8hf psqrt<Packet8hf>(const Packet8hf& a) {
5788  return vsqrtq_f16(a);
5789 }
5790 
5791 template <>
5792 EIGEN_STRONG_INLINE Packet4hf psqrt<Packet4hf>(const Packet4hf& a) {
5793  return vsqrt_f16(a);
5794 }
5795 
5796 template <>
5797 EIGEN_STRONG_INLINE Packet8hf pand<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
5798  return vreinterpretq_f16_u16(vandq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)));
5799 }
5800 
5801 template <>
5802 EIGEN_STRONG_INLINE Packet4hf pand<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
5803  return vreinterpret_f16_u16(vand_u16(vreinterpret_u16_f16(a), vreinterpret_u16_f16(b)));
5804 }
5805 
5806 template <>
5807 EIGEN_STRONG_INLINE Packet8hf por<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
5808  return vreinterpretq_f16_u16(vorrq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)));
5809 }
5810 
5811 template <>
5812 EIGEN_STRONG_INLINE Packet4hf por<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
5813  return vreinterpret_f16_u16(vorr_u16(vreinterpret_u16_f16(a), vreinterpret_u16_f16(b)));
5814 }
5815 
5816 template <>
5817 EIGEN_STRONG_INLINE Packet8hf pxor<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
5818  return vreinterpretq_f16_u16(veorq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)));
5819 }
5820 
5821 template <>
5822 EIGEN_STRONG_INLINE Packet4hf pxor<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
5823  return vreinterpret_f16_u16(veor_u16(vreinterpret_u16_f16(a), vreinterpret_u16_f16(b)));
5824 }
5825 
5826 template <>
5827 EIGEN_STRONG_INLINE Packet8hf pandnot<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
5828  return vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)));
5829 }
5830 
5831 template <>
5832 EIGEN_STRONG_INLINE Packet4hf pandnot<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
5833  return vreinterpret_f16_u16(vbic_u16(vreinterpret_u16_f16(a), vreinterpret_u16_f16(b)));
5834 }
5835 
5836 template <>
5837 EIGEN_STRONG_INLINE Packet8hf pload<Packet8hf>(const Eigen::half* from) {
5838  EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f16(reinterpret_cast<const float16_t*>(from));
5839 }
5840 
5841 template <>
5842 EIGEN_STRONG_INLINE Packet4hf pload<Packet4hf>(const Eigen::half* from) {
5843  EIGEN_DEBUG_ALIGNED_LOAD return vld1_f16(reinterpret_cast<const float16_t*>(from));
5844 }
5845 
5846 template <>
5847 EIGEN_STRONG_INLINE Packet8hf ploadu<Packet8hf>(const Eigen::half* from) {
5848  EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f16(reinterpret_cast<const float16_t*>(from));
5849 }
5850 
5851 template <>
5852 EIGEN_STRONG_INLINE Packet4hf ploadu<Packet4hf>(const Eigen::half* from) {
5853  EIGEN_DEBUG_UNALIGNED_LOAD return vld1_f16(reinterpret_cast<const float16_t*>(from));
5854 }
5855 
5856 template <>
5857 EIGEN_STRONG_INLINE Packet8hf ploaddup<Packet8hf>(const Eigen::half* from) {
5858  Packet8hf packet;
5859  packet[0] = from[0].x;
5860  packet[1] = from[0].x;
5861  packet[2] = from[1].x;
5862  packet[3] = from[1].x;
5863  packet[4] = from[2].x;
5864  packet[5] = from[2].x;
5865  packet[6] = from[3].x;
5866  packet[7] = from[3].x;
5867  return packet;
5868 }
5869 
5870 template <>
5871 EIGEN_STRONG_INLINE Packet4hf ploaddup<Packet4hf>(const Eigen::half* from) {
5872  float16x4_t packet;
5873  float16_t* tmp;
5874  tmp = (float16_t*)&packet;
5875  tmp[0] = from[0].x;
5876  tmp[1] = from[0].x;
5877  tmp[2] = from[1].x;
5878  tmp[3] = from[1].x;
5879  return packet;
5880 }
5881 
5882 template <>
5883 EIGEN_STRONG_INLINE Packet8hf ploadquad<Packet8hf>(const Eigen::half* from) {
5884  Packet4hf lo, hi;
5885  lo = vld1_dup_f16(reinterpret_cast<const float16_t*>(from));
5886  hi = vld1_dup_f16(reinterpret_cast<const float16_t*>(from + 1));
5887  return vcombine_f16(lo, hi);
5888 }
5889 
5890 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pinsertfirst(const Packet8hf& a, Eigen::half b) {
5891  return vsetq_lane_f16(b.x, a, 0);
5892 }
5893 
5894 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pinsertfirst(const Packet4hf& a, Eigen::half b) {
5895  return vset_lane_f16(b.x, a, 0);
5896 }
5897 
5898 template <>
5899 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pselect(const Packet8hf& mask, const Packet8hf& a, const Packet8hf& b) {
5900  return vbslq_f16(vreinterpretq_u16_f16(mask), a, b);
5901 }
5902 
5903 template <>
5904 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pselect(const Packet4hf& mask, const Packet4hf& a, const Packet4hf& b) {
5905  return vbsl_f16(vreinterpret_u16_f16(mask), a, b);
5906 }
5907 
5908 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pinsertlast(const Packet8hf& a, Eigen::half b) {
5909  return vsetq_lane_f16(b.x, a, 7);
5910 }
5911 
5912 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pinsertlast(const Packet4hf& a, Eigen::half b) {
5913  return vset_lane_f16(b.x, a, 3);
5914 }
5915 
5916 template <>
5917 EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet8hf& from) {
5918  EIGEN_DEBUG_ALIGNED_STORE vst1q_f16(reinterpret_cast<float16_t*>(to), from);
5919 }
5920 
5921 template <>
5922 EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet4hf& from) {
5923  EIGEN_DEBUG_ALIGNED_STORE vst1_f16(reinterpret_cast<float16_t*>(to), from);
5924 }
5925 
5926 template <>
5927 EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet8hf& from) {
5928  EIGEN_DEBUG_UNALIGNED_STORE vst1q_f16(reinterpret_cast<float16_t*>(to), from);
5929 }
5930 
5931 template <>
5932 EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet4hf& from) {
5933  EIGEN_DEBUG_UNALIGNED_STORE vst1_f16(reinterpret_cast<float16_t*>(to), from);
5934 }
5935 
5936 template <>
5937 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pgather<Eigen::half, Packet8hf>(const Eigen::half* from, Index stride) {
5938  Packet8hf res = pset1<Packet8hf>(Eigen::half(0.f));
5939  res = vsetq_lane_f16(from[0 * stride].x, res, 0);
5940  res = vsetq_lane_f16(from[1 * stride].x, res, 1);
5941  res = vsetq_lane_f16(from[2 * stride].x, res, 2);
5942  res = vsetq_lane_f16(from[3 * stride].x, res, 3);
5943  res = vsetq_lane_f16(from[4 * stride].x, res, 4);
5944  res = vsetq_lane_f16(from[5 * stride].x, res, 5);
5945  res = vsetq_lane_f16(from[6 * stride].x, res, 6);
5946  res = vsetq_lane_f16(from[7 * stride].x, res, 7);
5947  return res;
5948 }
5949 
5950 template <>
5951 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pgather<Eigen::half, Packet4hf>(const Eigen::half* from, Index stride) {
5952  Packet4hf res = pset1<Packet4hf>(Eigen::half(0.f));
5953  res = vset_lane_f16(from[0 * stride].x, res, 0);
5954  res = vset_lane_f16(from[1 * stride].x, res, 1);
5955  res = vset_lane_f16(from[2 * stride].x, res, 2);
5956  res = vset_lane_f16(from[3 * stride].x, res, 3);
5957  return res;
5958 }
5959 
5960 template <>
5961 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet8hf>(Eigen::half* to, const Packet8hf& from,
5962  Index stride) {
5963  to[stride * 0].x = vgetq_lane_f16(from, 0);
5964  to[stride * 1].x = vgetq_lane_f16(from, 1);
5965  to[stride * 2].x = vgetq_lane_f16(from, 2);
5966  to[stride * 3].x = vgetq_lane_f16(from, 3);
5967  to[stride * 4].x = vgetq_lane_f16(from, 4);
5968  to[stride * 5].x = vgetq_lane_f16(from, 5);
5969  to[stride * 6].x = vgetq_lane_f16(from, 6);
5970  to[stride * 7].x = vgetq_lane_f16(from, 7);
5971 }
5972 
5973 template <>
5974 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet4hf>(Eigen::half* to, const Packet4hf& from,
5975  Index stride) {
5976  to[stride * 0].x = vget_lane_f16(from, 0);
5977  to[stride * 1].x = vget_lane_f16(from, 1);
5978  to[stride * 2].x = vget_lane_f16(from, 2);
5979  to[stride * 3].x = vget_lane_f16(from, 3);
5980 }
5981 
5982 template <>
5983 EIGEN_STRONG_INLINE void prefetch<Eigen::half>(const Eigen::half* addr) {
5984  EIGEN_ARM_PREFETCH(addr);
5985 }
5986 
5987 template <>
5988 EIGEN_STRONG_INLINE Eigen::half pfirst<Packet8hf>(const Packet8hf& a) {
5989  float16_t x[8];
5990  vst1q_f16(x, a);
5991  Eigen::half h;
5992  h.x = x[0];
5993  return h;
5994 }
5995 
5996 template <>
5997 EIGEN_STRONG_INLINE Eigen::half pfirst<Packet4hf>(const Packet4hf& a) {
5998  float16_t x[4];
5999  vst1_f16(x, a);
6000  Eigen::half h;
6001  h.x = x[0];
6002  return h;
6003 }
6004 
6005 template <>
6006 EIGEN_STRONG_INLINE Packet8hf preverse(const Packet8hf& a) {
6007  float16x4_t a_lo, a_hi;
6008  Packet8hf a_r64;
6009 
6010  a_r64 = vrev64q_f16(a);
6011  a_lo = vget_low_f16(a_r64);
6012  a_hi = vget_high_f16(a_r64);
6013  return vcombine_f16(a_hi, a_lo);
6014 }
6015 
6016 template <>
6017 EIGEN_STRONG_INLINE Packet4hf preverse<Packet4hf>(const Packet4hf& a) {
6018  return vrev64_f16(a);
6019 }
6020 
6021 template <>
6022 EIGEN_STRONG_INLINE Packet8hf pabs<Packet8hf>(const Packet8hf& a) {
6023  return vabsq_f16(a);
6024 }
6025 
6026 template <>
6027 EIGEN_STRONG_INLINE Packet8hf psignbit(const Packet8hf& a) {
6028  return vreinterpretq_f16_s16(vshrq_n_s16(vreinterpretq_s16_f16(a), 15));
6029 }
6030 
6031 template <>
6032 EIGEN_STRONG_INLINE Packet4hf pabs<Packet4hf>(const Packet4hf& a) {
6033  return vabs_f16(a);
6034 }
6035 
6036 template <>
6037 EIGEN_STRONG_INLINE Packet4hf psignbit(const Packet4hf& a) {
6038  return vreinterpret_f16_s16(vshr_n_s16(vreinterpret_s16_f16(a), 15));
6039 }
6040 
6041 template <>
6042 EIGEN_STRONG_INLINE Eigen::half predux<Packet8hf>(const Packet8hf& a) {
6043  float16x4_t a_lo, a_hi, sum;
6044 
6045  a_lo = vget_low_f16(a);
6046  a_hi = vget_high_f16(a);
6047  sum = vpadd_f16(a_lo, a_hi);
6048  sum = vpadd_f16(sum, sum);
6049  sum = vpadd_f16(sum, sum);
6050 
6051  Eigen::half h;
6052  h.x = vget_lane_f16(sum, 0);
6053  return h;
6054 }
6055 
6056 template <>
6057 EIGEN_STRONG_INLINE Eigen::half predux<Packet4hf>(const Packet4hf& a) {
6058  float16x4_t sum;
6059 
6060  sum = vpadd_f16(a, a);
6061  sum = vpadd_f16(sum, sum);
6062  Eigen::half h;
6063  h.x = vget_lane_f16(sum, 0);
6064  return h;
6065 }
6066 
6067 template <>
6068 EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet8hf>(const Packet8hf& a) {
6069  float16x4_t a_lo, a_hi, prod;
6070 
6071  a_lo = vget_low_f16(a);
6072  a_hi = vget_high_f16(a);
6073  prod = vmul_f16(a_lo, a_hi);
6074  prod = vmul_f16(prod, vrev64_f16(prod));
6075 
6076  Eigen::half h;
6077  h.x = vmulh_f16(vget_lane_f16(prod, 0), vget_lane_f16(prod, 1));
6078  return h;
6079 }
6080 
6081 template <>
6082 EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet4hf>(const Packet4hf& a) {
6083  float16x4_t prod;
6084  prod = vmul_f16(a, vrev64_f16(a));
6085  Eigen::half h;
6086  h.x = vmulh_f16(vget_lane_f16(prod, 0), vget_lane_f16(prod, 1));
6087  return h;
6088 }
6089 
6090 template <>
6091 EIGEN_STRONG_INLINE Eigen::half predux_min<Packet8hf>(const Packet8hf& a) {
6092  Eigen::half h;
6093  h.x = vminvq_f16(a);
6094  return h;
6095 }
6096 
6097 template <>
6098 EIGEN_STRONG_INLINE Eigen::half predux_min<Packet4hf>(const Packet4hf& a) {
6099  Eigen::half h;
6100  h.x = vminv_f16(a);
6101  return h;
6102 }
6103 
6104 template <>
6105 EIGEN_STRONG_INLINE Eigen::half predux_max<Packet8hf>(const Packet8hf& a) {
6106  Eigen::half h;
6107  h.x = vmaxvq_f16(a);
6108  return h;
6109 }
6110 
6111 template <>
6112 EIGEN_STRONG_INLINE Eigen::half predux_max<Packet4hf>(const Packet4hf& a) {
6113  Eigen::half h;
6114  h.x = vmaxv_f16(a);
6115  return h;
6116 }
6117 
6118 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8hf, 4>& kernel) {
6119  const float16x8x2_t zip16_1 = vzipq_f16(kernel.packet[0], kernel.packet[1]);
6120  const float16x8x2_t zip16_2 = vzipq_f16(kernel.packet[2], kernel.packet[3]);
6121 
6122  const float32x4x2_t zip32_1 = vzipq_f32(vreinterpretq_f32_f16(zip16_1.val[0]), vreinterpretq_f32_f16(zip16_2.val[0]));
6123  const float32x4x2_t zip32_2 = vzipq_f32(vreinterpretq_f32_f16(zip16_1.val[1]), vreinterpretq_f32_f16(zip16_2.val[1]));
6124 
6125  kernel.packet[0] = vreinterpretq_f16_f32(zip32_1.val[0]);
6126  kernel.packet[1] = vreinterpretq_f16_f32(zip32_1.val[1]);
6127  kernel.packet[2] = vreinterpretq_f16_f32(zip32_2.val[0]);
6128  kernel.packet[3] = vreinterpretq_f16_f32(zip32_2.val[1]);
6129 }
6130 
6131 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4hf, 4>& kernel) {
6132  EIGEN_ALIGN16 float16x4x4_t tmp_x4;
6133  float16_t* tmp = (float16_t*)&kernel;
6134  tmp_x4 = vld4_f16(tmp);
6135 
6136  kernel.packet[0] = tmp_x4.val[0];
6137  kernel.packet[1] = tmp_x4.val[1];
6138  kernel.packet[2] = tmp_x4.val[2];
6139  kernel.packet[3] = tmp_x4.val[3];
6140 }
6141 
6142 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8hf, 8>& kernel) {
6143  float16x8x2_t T_1[4];
6144 
6145  T_1[0] = vuzpq_f16(kernel.packet[0], kernel.packet[1]);
6146  T_1[1] = vuzpq_f16(kernel.packet[2], kernel.packet[3]);
6147  T_1[2] = vuzpq_f16(kernel.packet[4], kernel.packet[5]);
6148  T_1[3] = vuzpq_f16(kernel.packet[6], kernel.packet[7]);
6149 
6150  float16x8x2_t T_2[4];
6151  T_2[0] = vuzpq_f16(T_1[0].val[0], T_1[1].val[0]);
6152  T_2[1] = vuzpq_f16(T_1[0].val[1], T_1[1].val[1]);
6153  T_2[2] = vuzpq_f16(T_1[2].val[0], T_1[3].val[0]);
6154  T_2[3] = vuzpq_f16(T_1[2].val[1], T_1[3].val[1]);
6155 
6156  float16x8x2_t T_3[4];
6157  T_3[0] = vuzpq_f16(T_2[0].val[0], T_2[2].val[0]);
6158  T_3[1] = vuzpq_f16(T_2[0].val[1], T_2[2].val[1]);
6159  T_3[2] = vuzpq_f16(T_2[1].val[0], T_2[3].val[0]);
6160  T_3[3] = vuzpq_f16(T_2[1].val[1], T_2[3].val[1]);
6161 
6162  kernel.packet[0] = T_3[0].val[0];
6163  kernel.packet[1] = T_3[2].val[0];
6164  kernel.packet[2] = T_3[1].val[0];
6165  kernel.packet[3] = T_3[3].val[0];
6166  kernel.packet[4] = T_3[0].val[1];
6167  kernel.packet[5] = T_3[2].val[1];
6168  kernel.packet[6] = T_3[1].val[1];
6169  kernel.packet[7] = T_3[3].val[1];
6170 }
6171 #endif // end EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
6172 
6173 } // end namespace internal
6174 
6175 } // end namespace Eigen
6176 
6177 #endif // EIGEN_PACKET_MATH_NEON_H
AnnoyingScalar abs(const AnnoyingScalar &x)
Definition: AnnoyingScalar.h:135
int i
Definition: BiCGSTAB_step_by_step.cpp:9
const unsigned n
Definition: CG3DPackingUnitTest.cpp:11
#define EIGEN_ALIGN16
Definition: ConfigureVectorization.h:142
#define EIGEN_DEBUG_ALIGNED_STORE
Definition: GenericPacketMath.h:38
#define EIGEN_DEBUG_ALIGNED_LOAD
Definition: GenericPacketMath.h:30
#define EIGEN_DEBUG_UNALIGNED_STORE
Definition: GenericPacketMath.h:42
#define EIGEN_DEBUG_UNALIGNED_LOAD
Definition: GenericPacketMath.h:34
#define EIGEN_ALWAYS_INLINE
Definition: Macros.h:845
#define EIGEN_UNROLL_LOOP
Definition: Macros.h:1298
#define EIGEN_DEVICE_FUNC
Definition: Macros.h:892
#define eigen_assert(x)
Definition: Macros.h:910
#define EIGEN_FAST_MATH
Definition: Macros.h:51
#define EIGEN_STRONG_INLINE
Definition: Macros.h:834
Vector3f p1
Definition: MatrixBase_all.cpp:2
#define EIGEN_ARM_PREFETCH(ADDR)
Definition: NEON/PacketMath.h:172
cout<< "Here is the matrix m:"<< endl<< m<< endl;Matrix< ptrdiff_t, 3, 1 > res
Definition: PartialRedux_count.cpp:3
float * p
Definition: Tutorial_Map_using.cpp:9
Scalar * b
Definition: benchVecAdd.cpp:17
EIGEN_STRONG_INLINE PacketScalar packet(Index rowId, Index colId) const
Definition: PlainObjectBase.h:247
Tag for template metaprogramming.
Definition: Logger.h:174
@ N
Definition: constructor.cpp:22
static int f(const TensorMap< Tensor< int, 3 > > &tensor)
Definition: cxx11_tensor_map.cpp:237
#define min(a, b)
Definition: datatypes.h:22
#define max(a, b)
Definition: datatypes.h:23
@ Unaligned
Definition: Constants.h:235
@ Aligned16
Definition: Constants.h:237
RealScalar s
Definition: level1_cplx_impl.h:130
const Scalar * a
Definition: level2_cplx_impl.h:32
int * m
Definition: level2_cplx_impl.h:294
char char char int int * k
Definition: level2_impl.h:374
Eigen::Matrix< Scalar, Dynamic, Dynamic, ColMajor > tmp
Definition: level3_impl.h:365
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __bfloat16_raw raw_uint16_to_bfloat16(unsigned short value)
EIGEN_ALWAYS_INLINE void zip_in_place< Packet8c >(Packet8c &p1, Packet8c &p2)
Definition: NEON/PacketMath.h:4125
void zip_in_place(Packet &p1, Packet &p2)
EIGEN_ALWAYS_INLINE void zip_in_place< Packet2i >(Packet2i &p1, Packet2i &p2)
Definition: NEON/PacketMath.h:4153
EIGEN_ALWAYS_INLINE void zip_in_place< Packet16c >(Packet16c &p1, Packet16c &p2)
Definition: NEON/PacketMath.h:4132
EIGEN_ALWAYS_INLINE void zip_in_place< Packet4i >(Packet4i &p1, Packet4i &p2)
Definition: NEON/PacketMath.h:4160
EIGEN_ALWAYS_INLINE void zip_in_place< Packet8uc >(Packet8uc &p1, Packet8uc &p2)
Definition: NEON/PacketMath.h:4139
EIGEN_ALWAYS_INLINE void zip_in_place< Packet2ui >(Packet2ui &p1, Packet2ui &p2)
Definition: NEON/PacketMath.h:4167
EIGEN_ALWAYS_INLINE void zip_in_place< Packet16uc >(Packet16uc &p1, Packet16uc &p2)
Definition: NEON/PacketMath.h:4146
EIGEN_ALWAYS_INLINE void zip_in_place< Packet4us >(Packet4us &p1, Packet4us &p2)
Definition: NEON/PacketMath.h:4195
EIGEN_ALWAYS_INLINE void zip_in_place< Packet4ui >(Packet4ui &p1, Packet4ui &p2)
Definition: NEON/PacketMath.h:4174
EIGEN_ALWAYS_INLINE void zip_in_place< Packet8s >(Packet8s &p1, Packet8s &p2)
Definition: NEON/PacketMath.h:4188
EIGEN_ALWAYS_INLINE void zip_in_place< Packet8us >(Packet8us &p1, Packet8us &p2)
Definition: NEON/PacketMath.h:4202
EIGEN_ALWAYS_INLINE void zip_in_place< Packet4f >(Packet4f &p1, Packet4f &p2)
Definition: NEON/PacketMath.h:4118
EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock< Packet, 2 > &kernel)
Definition: NEON/PacketMath.h:4209
EIGEN_ALWAYS_INLINE void zip_in_place< Packet4bf >(Packet4bf &p1, Packet4bf &p2)
Definition: NEON/PacketMath.h:4785
EIGEN_ALWAYS_INLINE void zip_in_place< Packet2f >(Packet2f &p1, Packet2f &p2)
Definition: NEON/PacketMath.h:4111
EIGEN_ALWAYS_INLINE void zip_in_place< Packet4s >(Packet4s &p1, Packet4s &p2)
Definition: NEON/PacketMath.h:4181
EIGEN_STRONG_INLINE int64_t predux_min< Packet2l >(const Packet2l &a)
Definition: LSX/PacketMath.h:2095
EIGEN_STRONG_INLINE unsigned char predux< Packet16uc >(const Packet16uc &a)
Definition: AltiVec/PacketMath.h:2515
EIGEN_STRONG_INLINE Packet8uc pmin< Packet8uc >(const Packet8uc &a, const Packet8uc &b)
Definition: NEON/PacketMath.h:1482
EIGEN_STRONG_INLINE Packet4f pandnot< Packet4f >(const Packet4f &a, const Packet4f &b)
Definition: AltiVec/PacketMath.h:1465
EIGEN_STRONG_INLINE Packet8s pabsdiff< Packet8s >(const Packet8s &a, const Packet8s &b)
Definition: LSX/PacketMath.h:2764
EIGEN_STRONG_INLINE Packet4ui psub< Packet4ui >(const Packet4ui &a, const Packet4ui &b)
Definition: LSX/PacketMath.h:634
EIGEN_STRONG_INLINE void pscatter< bfloat16, Packet4bf >(bfloat16 *to, const Packet4bf &from, Index stride)
Definition: NEON/PacketMath.h:4978
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ui pgather< uint32_t, Packet2ui >(const uint32_t *from, Index stride)
Definition: NEON/PacketMath.h:2972
EIGEN_STRONG_INLINE Packet2i pmax< Packet2i >(const Packet2i &a, const Packet2i &b)
Definition: NEON/PacketMath.h:1607
EIGEN_STRONG_INLINE Packet16c pmin< Packet16c >(const Packet16c &a, const Packet16c &b)
Definition: AltiVec/PacketMath.h:1273
EIGEN_STRONG_INLINE Packet8uc pload< Packet8uc >(const uint8_t *from)
Definition: NEON/PacketMath.h:2414
EIGEN_STRONG_INLINE Packet2ui pabsdiff< Packet2ui >(const Packet2ui &a, const Packet2ui &b)
Definition: NEON/PacketMath.h:1423
EIGEN_STRONG_INLINE Packet8us pand< Packet8us >(const Packet8us &a, const Packet8us &b)
Definition: AltiVec/PacketMath.h:1418
EIGEN_STRONG_INLINE Packet8c por< Packet8c >(const Packet8c &a, const Packet8c &b)
Definition: NEON/PacketMath.h:1984
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter< uint16_t, Packet4us >(uint16_t *to, const Packet4us &from, Index stride)
Definition: NEON/PacketMath.h:3103
EIGEN_STRONG_INLINE Packet2d shuffle(const Packet2d &m, const Packet2d &n, int mask)
Definition: LSX/PacketMath.h:150
eigen_packet_wrapper< uint32_t, 5 > Packet4uc
Definition: NEON/PacketMath.h:80
EIGEN_STRONG_INLINE void pstore< int8_t >(int8_t *to, const Packet16c &from)
Definition: LSX/PacketMath.h:1541
EIGEN_STRONG_INLINE Packet4uc plset< Packet4uc >(const uint8_t &a)
Definition: NEON/PacketMath.h:775
EIGEN_STRONG_INLINE Packet16c pcmp_le< Packet16c >(const Packet16c &a, const Packet16c &b)
Definition: LSX/PacketMath.h:1048
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4c pgather< int8_t, Packet4c >(const int8_t *from, Index stride)
Definition: NEON/PacketMath.h:2842
__m128d Packet2d
Definition: LSX/PacketMath.h:36
EIGEN_STRONG_INLINE Packet4ui pset1< Packet4ui >(const uint32_t &from)
Definition: LSX/PacketMath.h:490
EIGEN_STRONG_INLINE void pstoreu< double >(double *to, const Packet4d &from)
Definition: AVX/PacketMath.h:1628
EIGEN_STRONG_INLINE Packet8s pmax< Packet8s >(const Packet8s &a, const Packet8s &b)
Definition: AltiVec/PacketMath.h:1297
EIGEN_STRONG_INLINE Packet8c pload< Packet8c >(const int8_t *from)
Definition: NEON/PacketMath.h:2400
EIGEN_STRONG_INLINE float predux< Packet2f >(const Packet2f &a)
Definition: NEON/PacketMath.h:3468
EIGEN_STRONG_INLINE Packet2ui psub< Packet2ui >(const Packet2ui &a, const Packet2ui &b)
Definition: NEON/PacketMath.h:973
EIGEN_STRONG_INLINE short int pfirst< Packet8s >(const Packet8s &a)
Definition: AltiVec/PacketMath.h:1883
EIGEN_STRONG_INLINE double predux< Packet2d >(const Packet2d &a)
Definition: LSX/PacketMath.h:1965
EIGEN_STRONG_INLINE void pstoreu< uint32_t >(uint32_t *to, const Packet8ui &from)
Definition: AVX/PacketMath.h:1636
EIGEN_STRONG_INLINE void prefetch< uint64_t >(const uint64_t *addr)
Definition: LSX/PacketMath.h:1868
EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf &a)
Definition: AltiVec/Complex.h:268
eigen_packet_wrapper< __m128i, 3 > Packet2l
Definition: LSX/PacketMath.h:41
EIGEN_STRONG_INLINE Packet4c psub< Packet4c >(const Packet4c &a, const Packet4c &b)
Definition: NEON/PacketMath.h:923
EIGEN_STRONG_INLINE Packet2l pdiv< Packet2l >(const Packet2l &a, const Packet2l &b)
Definition: LSX/PacketMath.h:794
EIGEN_STRONG_INLINE Packet8c psub< Packet8c >(const Packet8c &a, const Packet8c &b)
Definition: NEON/PacketMath.h:928
EIGEN_STRONG_INLINE Packet2ui pdiv< Packet2ui >(const Packet2ui &, const Packet2ui &)
Definition: NEON/PacketMath.h:1260
EIGEN_STRONG_INLINE Packet4s pcmp_lt< Packet4s >(const Packet4s &a, const Packet4s &b)
Definition: NEON/PacketMath.h:1753
EIGEN_STRONG_INLINE Packet16c pmax< Packet16c >(const Packet16c &a, const Packet16c &b)
Definition: AltiVec/PacketMath.h:1305
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16uc pgather< uint8_t, Packet16uc >(const uint8_t *from, Index stride)
Definition: LSX/PacketMath.h:1676
EIGEN_STRONG_INLINE Packet4s pcmp_eq< Packet4s >(const Packet4s &a, const Packet4s &b)
Definition: NEON/PacketMath.h:1838
EIGEN_STRONG_INLINE Packet4us pset1< Packet4us >(const uint16_t &from)
Definition: NEON/PacketMath.h:709
EIGEN_STRONG_INLINE Packet2i pand< Packet2i >(const Packet2i &a, const Packet2i &b)
Definition: NEON/PacketMath.h:1947
EIGEN_STRONG_INLINE void prefetch< int8_t >(const int8_t *addr)
Definition: LSX/PacketMath.h:1840
EIGEN_STRONG_INLINE Packet8c pand< Packet8c >(const Packet8c &a, const Packet8c &b)
Definition: NEON/PacketMath.h:1911
EIGEN_STRONG_INLINE void prefetch< uint32_t >(const uint32_t *addr)
Definition: AVX/PacketMath.h:1758
EIGEN_STRONG_INLINE Packet2l pandnot< Packet2l >(const Packet2l &a, const Packet2l &b)
Definition: LSX/PacketMath.h:1019
EIGEN_STRONG_INLINE int64_t predux< Packet2l >(const Packet2l &a)
Definition: LSX/PacketMath.h:1987
EIGEN_DEVICE_FUNC Packet padd(const Packet &a, const Packet &b)
Definition: GenericPacketMath.h:318
EIGEN_STRONG_INLINE float predux_min< Packet2f >(const Packet2f &a)
Definition: NEON/PacketMath.h:3770
EIGEN_STRONG_INLINE Packet4us pmin< Packet4us >(const Packet4us &a, const Packet4us &b)
Definition: NEON/PacketMath.h:1498
EIGEN_STRONG_INLINE int8_t predux_min< Packet8c >(const Packet8c &a)
Definition: NEON/PacketMath.h:3797
EIGEN_STRONG_INLINE Packet4us por< Packet4us >(const Packet4us &a, const Packet4us &b)
Definition: NEON/PacketMath.h:2012
EIGEN_STRONG_INLINE Packet8c ploadu< Packet8c >(const int8_t *from)
Definition: NEON/PacketMath.h:2477
EIGEN_STRONG_INLINE Packet8us pabsdiff< Packet8us >(const Packet8us &a, const Packet8us &b)
Definition: LSX/PacketMath.h:2816
EIGEN_STRONG_INLINE Packet4f pmin< Packet4f >(const Packet4f &a, const Packet4f &b)
Definition: AltiVec/PacketMath.h:1250
EIGEN_STRONG_INLINE Packet2d padd< Packet2d >(const Packet2d &a, const Packet2d &b)
Definition: LSX/PacketMath.h:605
EIGEN_STRONG_INLINE Packet2d pandnot< Packet2d >(const Packet2d &a, const Packet2d &b)
Definition: LSX/PacketMath.h:1003
uint32x2_t Packet2ui
Definition: NEON/PacketMath.h:89
EIGEN_STRONG_INLINE Packet8c padd< Packet8c >(const Packet8c &a, const Packet8c &b)
Definition: NEON/PacketMath.h:853
EIGEN_STRONG_INLINE Packet8f Bf16ToF32(const Packet8bf &a)
Definition: AVX/PacketMath.h:2558
EIGEN_STRONG_INLINE Packet8f pzero(const Packet8f &)
Definition: AVX/PacketMath.h:774
EIGEN_STRONG_INLINE int32_t predux_mul< Packet2i >(const Packet2i &a)
Definition: NEON/PacketMath.h:3734
EIGEN_STRONG_INLINE Packet2f pand< Packet2f >(const Packet2f &a, const Packet2f &b)
Definition: NEON/PacketMath.h:1899
EIGEN_STRONG_INLINE Packet2i psub< Packet2i >(const Packet2i &a, const Packet2i &b)
Definition: NEON/PacketMath.h:965
EIGEN_STRONG_INLINE uint32_t predux_max< Packet4ui >(const Packet4ui &a)
Definition: LSX/PacketMath.h:2166
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter< uint64_t, Packet2ul >(uint64_t *to, const Packet2ul &from, Index stride)
Definition: LSX/PacketMath.h:1825
__vector int Packet4i
Definition: AltiVec/PacketMath.h:34
EIGEN_STRONG_INLINE Packet16uc ploadu< Packet16uc >(const unsigned char *from)
Definition: AltiVec/PacketMath.h:1557
EIGEN_STRONG_INLINE Packet4us ploadu< Packet4us >(const uint16_t *from)
Definition: NEON/PacketMath.h:2507
EIGEN_STRONG_INLINE Packet4bf pcmp_le< Packet4bf >(const Packet4bf &a, const Packet4bf &b)
Definition: NEON/PacketMath.h:5032
EIGEN_STRONG_INLINE Packet4f vec4f_movelh(const Packet4f &a, const Packet4f &b)
Definition: LSX/PacketMath.h:132
EIGEN_STRONG_INLINE Packet2f pset1< Packet2f >(const float &from)
Definition: NEON/PacketMath.h:669
EIGEN_STRONG_INLINE Packet2l ploadu< Packet2l >(const int64_t *from)
Definition: LSX/PacketMath.h:1464
EIGEN_STRONG_INLINE Packet8c pdiv< Packet8c >(const Packet8c &, const Packet8c &)
Definition: NEON/PacketMath.h:1205
EIGEN_STRONG_INLINE Packet2d pmin< PropagateNaN, Packet2d >(const Packet2d &a, const Packet2d &b)
Definition: LSX/PacketMath.h:2729
EIGEN_STRONG_INLINE Packet4f padd< Packet4f >(const Packet4f &a, const Packet4f &b)
Definition: AltiVec/PacketMath.h:1066
EIGEN_STRONG_INLINE Packet16uc pmul< Packet16uc >(const Packet16uc &a, const Packet16uc &b)
Definition: AltiVec/PacketMath.h:1182
EIGEN_STRONG_INLINE Packet4bf pround< Packet4bf >(const Packet4bf &a)
Definition: NEON/PacketMath.h:4938
EIGEN_STRONG_INLINE Packet4bf pset1< Packet4bf >(const bfloat16 &from)
Definition: NEON/PacketMath.h:4825
EIGEN_STRONG_INLINE Packet4i por< Packet4i >(const Packet4i &a, const Packet4i &b)
Definition: AltiVec/PacketMath.h:1431
EIGEN_STRONG_INLINE Packet2d pmax< PropagateNumbers, Packet2d >(const Packet2d &a, const Packet2d &b)
Definition: SSE/PacketMath.h:1149
int8x8_t Packet8c
Definition: NEON/PacketMath.h:78
EIGEN_STRONG_INLINE Packet4c ploaddup< Packet4c >(const int8_t *from)
Definition: NEON/PacketMath.h:2548
EIGEN_STRONG_INLINE Packet8c pcmp_eq< Packet8c >(const Packet8c &a, const Packet8c &b)
Definition: NEON/PacketMath.h:1817
EIGEN_STRONG_INLINE short int predux_min< Packet8s >(const Packet8s &a)
Definition: AltiVec/PacketMath.h:2617
EIGEN_STRONG_INLINE Packet16c pdiv< Packet16c >(const Packet16c &, const Packet16c &)
Definition: NEON/PacketMath.h:1210
EIGEN_STRONG_INLINE Packet16c por< Packet16c >(const Packet16c &a, const Packet16c &b)
Definition: LSX/PacketMath.h:925
EIGEN_STRONG_INLINE Packet4f pcmp_eq< Packet4f >(const Packet4f &a, const Packet4f &b)
Definition: LSX/PacketMath.h:1131
EIGEN_STRONG_INLINE Packet2ul pmin< Packet2ul >(const Packet2ul &a, const Packet2ul &b)
Definition: LSX/PacketMath.h:1200
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8us pgather< uint16_t, Packet8us >(const uint16_t *from, Index stride)
Definition: LSX/PacketMath.h:1697
EIGEN_STRONG_INLINE Packet2i pcmp_le< Packet2i >(const Packet2i &a, const Packet2i &b)
Definition: NEON/PacketMath.h:1684
EIGEN_STRONG_INLINE Packet4bf preverse< Packet4bf >(const Packet4bf &a)
Definition: NEON/PacketMath.h:5003
EIGEN_STRONG_INLINE Packet4ui pmul< Packet4ui >(const Packet4ui &a, const Packet4ui &b)
Definition: LSX/PacketMath.h:769
__vector unsigned char Packet16uc
Definition: AltiVec/PacketMath.h:41
EIGEN_STRONG_INLINE Packet4f vec4f_swizzle2(const Packet4f &a, const Packet4f &b, int p, int q, int r, int s)
Definition: LSX/PacketMath.h:129
EIGEN_STRONG_INLINE Packet8uc pandnot< Packet8uc >(const Packet8uc &a, const Packet8uc &b)
Definition: NEON/PacketMath.h:2142
EIGEN_STRONG_INLINE Packet4i pset1< Packet4i >(const int &from)
Definition: AltiVec/PacketMath.h:778
EIGEN_STRONG_INLINE Packet16c pload< Packet16c >(const signed char *from)
Definition: AltiVec/PacketMath.h:512
EIGEN_STRONG_INLINE Packet4f pabs< Packet4f >(const Packet4f &a)
Definition: ZVector/PacketMath.h:954
EIGEN_STRONG_INLINE Packet8us pmin< Packet8us >(const Packet8us &a, const Packet8us &b)
Definition: AltiVec/PacketMath.h:1269
EIGEN_STRONG_INLINE uint16_t predux_max< Packet4us >(const Packet4us &a)
Definition: NEON/PacketMath.h:4059
EIGEN_STRONG_INLINE Packet2d paddsub< Packet2d >(const Packet2d &a, const Packet2d &b)
Definition: LSX/PacketMath.h:661
EIGEN_STRONG_INLINE Packet4us pabsdiff< Packet4us >(const Packet4us &a, const Packet4us &b)
Definition: NEON/PacketMath.h:1407
EIGEN_STRONG_INLINE Packet4f shuffle2(const Packet4f &m, const Packet4f &n, int mask)
Definition: LSX/PacketMath.h:105
EIGEN_STRONG_INLINE Packet4bf pmin< Packet4bf >(const Packet4bf &a, const Packet4bf &b)
Definition: NEON/PacketMath.h:4874
eigen_packet_wrapper< uint16x4_t, 19 > Packet4bf
Definition: NEON/PacketMath.h:4726
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter< int8_t, Packet4c >(int8_t *to, const Packet4c &from, Index stride)
Definition: NEON/PacketMath.h:3011
EIGEN_STRONG_INLINE int8_t pfirst< Packet4c >(const Packet4c &a)
Definition: NEON/PacketMath.h:3209
EIGEN_STRONG_INLINE Packet16c psub< Packet16c >(const Packet16c &a, const Packet16c &b)
Definition: AltiVec/PacketMath.h:1111
EIGEN_STRONG_INLINE Packet2i ploaddup< Packet2i >(const int32_t *from)
Definition: NEON/PacketMath.h:2602
EIGEN_STRONG_INLINE short int predux_max< Packet8s >(const Packet8s &a)
Definition: AltiVec/PacketMath.h:2697
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc pgather< uint8_t, Packet8uc >(const uint8_t *from, Index stride)
Definition: NEON/PacketMath.h:2886
EIGEN_STRONG_INLINE Packet4f pcmp_le< Packet4f >(const Packet4f &a, const Packet4f &b)
Definition: LSX/PacketMath.h:1040
EIGEN_STRONG_INLINE Packet4bf pmax< PropagateNaN, Packet4bf >(const Packet4bf &a, const Packet4bf &b)
Definition: NEON/PacketMath.h:4883
EIGEN_STRONG_INLINE Packet4us pload< Packet4us >(const uint16_t *from)
Definition: NEON/PacketMath.h:2430
EIGEN_STRONG_INLINE Packet4c pload< Packet4c >(const int8_t *from)
Definition: NEON/PacketMath.h:2394
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter< int32_t, Packet2i >(int32_t *to, const Packet2i &from, Index stride)
Definition: NEON/PacketMath.h:3123
EIGEN_STRONG_INLINE unsigned short int predux_max< Packet8us >(const Packet8us &a)
Definition: AltiVec/PacketMath.h:2712
EIGEN_STRONG_INLINE Packet4c ploadquad< Packet4c >(const int8_t *from)
Definition: NEON/PacketMath.h:2631
EIGEN_STRONG_INLINE Packet2d vec2d_unpackhi(const Packet2d &a, const Packet2d &b)
Definition: LSX/PacketMath.h:161
EIGEN_STRONG_INLINE Packet8s pcmp_eq< Packet8s >(const Packet8s &a, const Packet8s &b)
Definition: LSX/PacketMath.h:1143
EIGEN_STRONG_INLINE Packet2ul pxor< Packet2ul >(const Packet2ul &a, const Packet2ul &b)
Definition: LSX/PacketMath.h:994
EIGEN_STRONG_INLINE float pfirst< Packet4f >(const Packet4f &a)
Definition: AltiVec/PacketMath.h:1863
EIGEN_STRONG_INLINE Packet2d pand< Packet2d >(const Packet2d &a, const Packet2d &b)
Definition: LSX/PacketMath.h:880
EIGEN_STRONG_INLINE Packet16uc pmax< Packet16uc >(const Packet16uc &a, const Packet16uc &b)
Definition: AltiVec/PacketMath.h:1309
EIGEN_STRONG_INLINE Packet4c pandnot< Packet4c >(const Packet4c &a, const Packet4c &b)
Definition: NEON/PacketMath.h:2126
EIGEN_STRONG_INLINE Packet4f ploadquad< Packet4f >(const float *from)
Definition: LSX/PacketMath.h:2703
EIGEN_STRONG_INLINE Packet4s pset1< Packet4s >(const int16_t &from)
Definition: NEON/PacketMath.h:701
EIGEN_STRONG_INLINE Packet4f shuffle1(const Packet4f &m, int mask)
Definition: LSX/PacketMath.h:97
EIGEN_STRONG_INLINE void prefetch< uint16_t >(const uint16_t *addr)
Definition: LSX/PacketMath.h:1860
EIGEN_STRONG_INLINE Packet2ul pandnot< Packet2ul >(const Packet2ul &a, const Packet2ul &b)
Definition: LSX/PacketMath.h:1035
EIGEN_STRONG_INLINE unsigned short int predux_min< Packet8us >(const Packet8us &a)
Definition: AltiVec/PacketMath.h:2632
EIGEN_STRONG_INLINE Packet8us pcmp_le< Packet8us >(const Packet8us &a, const Packet8us &b)
Definition: LSX/PacketMath.h:1068
EIGEN_STRONG_INLINE Packet4s pmin< Packet4s >(const Packet4s &a, const Packet4s &b)
Definition: NEON/PacketMath.h:1490
EIGEN_STRONG_INLINE Packet8s por< Packet8s >(const Packet8s &a, const Packet8s &b)
Definition: AltiVec/PacketMath.h:1435
EIGEN_STRONG_INLINE Packet8uc pcmp_lt< Packet8uc >(const Packet8uc &a, const Packet8uc &b)
Definition: NEON/PacketMath.h:1745
EIGEN_STRONG_INLINE void prefetch< int64_t >(const int64_t *addr)
Definition: LSX/PacketMath.h:1852
EIGEN_STRONG_INLINE Packet8us psub< Packet8us >(const Packet8us &a, const Packet8us &b)
Definition: AltiVec/PacketMath.h:1107
EIGEN_STRONG_INLINE Packet2f pcmp_eq< Packet2f >(const Packet2f &a, const Packet2f &b)
Definition: NEON/PacketMath.h:1804
EIGEN_STRONG_INLINE Packet4bf print< Packet4bf >(const Packet4bf &a)
Definition: NEON/PacketMath.h:4923
EIGEN_STRONG_INLINE Packet8c pxor< Packet8c >(const Packet8c &a, const Packet8c &b)
Definition: NEON/PacketMath.h:2057
EIGEN_STRONG_INLINE Packet4bf pmul< Packet4bf >(const Packet4bf &a, const Packet4bf &b)
Definition: NEON/PacketMath.h:4963
EIGEN_STRONG_INLINE Packet2ul pset1< Packet2ul >(const uint64_t &from)
Definition: LSX/PacketMath.h:494
EIGEN_STRONG_INLINE Packet4c pdiv< Packet4c >(const Packet4c &, const Packet4c &)
Definition: NEON/PacketMath.h:1200
EIGEN_STRONG_INLINE Packet4ui padd< Packet4ui >(const Packet4ui &a, const Packet4ui &b)
Definition: AltiVec/PacketMath.h:1074
EIGEN_STRONG_INLINE Packet8uc pxor< Packet8uc >(const Packet8uc &a, const Packet8uc &b)
Definition: NEON/PacketMath.h:2069
EIGEN_STRONG_INLINE void ptranspose(PacketBlock< Packet2cf, 2 > &kernel)
Definition: AltiVec/Complex.h:339
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pgather< int64_t, Packet2l >(const int64_t *from, Index stride)
Definition: LSX/PacketMath.h:1669
EIGEN_STRONG_INLINE Packet16c plset< Packet16c >(const signed char &a)
Definition: AltiVec/PacketMath.h:1057
EIGEN_STRONG_INLINE Packet4ui pand< Packet4ui >(const Packet4ui &a, const Packet4ui &b)
Definition: AltiVec/PacketMath.h:1414
EIGEN_STRONG_INLINE Packet4uc ploadquad< Packet4uc >(const uint8_t *from)
Definition: NEON/PacketMath.h:2648
EIGEN_STRONG_INLINE signed char pfirst< Packet16c >(const Packet16c &a)
Definition: AltiVec/PacketMath.h:1893
EIGEN_STRONG_INLINE Packet2ul pcmp_eq< Packet2ul >(const Packet2ul &a, const Packet2ul &b)
Definition: LSX/PacketMath.h:1167
EIGEN_STRONG_INLINE Packet4bf pmin< PropagateNaN, Packet4bf >(const Packet4bf &a, const Packet4bf &b)
Definition: NEON/PacketMath.h:4869
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8s pgather< int16_t, Packet8s >(const int16_t *from, Index stride)
Definition: LSX/PacketMath.h:1647
EIGEN_STRONG_INLINE uint8_t pfirst< Packet8uc >(const Packet8uc &a)
Definition: NEON/PacketMath.h:3225
EIGEN_STRONG_INLINE Packet16c pabsdiff< Packet16c >(const Packet16c &a, const Packet16c &b)
Definition: LSX/PacketMath.h:2758
EIGEN_STRONG_INLINE int32_t predux_min< Packet2i >(const Packet2i &a)
Definition: NEON/PacketMath.h:3901
eigen_packet_wrapper< int32_t, 2 > Packet4c
Definition: NEON/PacketMath.h:77
EIGEN_STRONG_INLINE Packet16uc pset1< Packet16uc >(const unsigned char &from)
Definition: AltiVec/PacketMath.h:798
EIGEN_STRONG_INLINE Packet4bf pmax< Packet4bf >(const Packet4bf &a, const Packet4bf &b)
Definition: NEON/PacketMath.h:4888
EIGEN_STRONG_INLINE Packet4uc pdiv< Packet4uc >(const Packet4uc &, const Packet4uc &)
Definition: NEON/PacketMath.h:1215
EIGEN_STRONG_INLINE signed char predux_mul< Packet16c >(const Packet16c &a)
Definition: AltiVec/PacketMath.h:2566
EIGEN_STRONG_INLINE Packet4i ploaddup< Packet4i >(const int *from)
Definition: AltiVec/PacketMath.h:1644
EIGEN_STRONG_INLINE bool predux_any(const Packet4f &x)
Definition: AltiVec/PacketMath.h:2751
EIGEN_STRONG_INLINE uint32_t predux_mul< Packet2ui >(const Packet2ui &a)
Definition: NEON/PacketMath.h:3742
EIGEN_STRONG_INLINE Packet4us pcmp_le< Packet4us >(const Packet4us &a, const Packet4us &b)
Definition: NEON/PacketMath.h:1676
EIGEN_STRONG_INLINE Packet4s padd< Packet4s >(const Packet4s &a, const Packet4s &b)
Definition: NEON/PacketMath.h:874
EIGEN_STRONG_INLINE Packet4s pand< Packet4s >(const Packet4s &a, const Packet4s &b)
Definition: NEON/PacketMath.h:1931
EIGEN_STRONG_INLINE float predux_max< Packet4f >(const Packet4f &a)
Definition: AltiVec/PacketMath.h:2679
EIGEN_STRONG_INLINE signed char predux_min< Packet16c >(const Packet16c &a)
Definition: AltiVec/PacketMath.h:2647
EIGEN_STRONG_INLINE Packet4bf padd< Packet4bf >(const Packet4bf &a, const Packet4bf &b)
Definition: NEON/PacketMath.h:4953
EIGEN_STRONG_INLINE Packet8us pdiv< Packet8us >(const Packet8us &a, const Packet8us &b)
Definition: LSX/PacketMath.h:798
EIGEN_STRONG_INLINE Packet2ui ploadu< Packet2ui >(const uint32_t *from)
Definition: NEON/PacketMath.h:2523
EIGEN_STRONG_INLINE Packet2ul ploaddup< Packet2ul >(const uint64_t *from)
Definition: LSX/PacketMath.h:1528
EIGEN_STRONG_INLINE Packet2d ploaddup< Packet2d >(const double *from)
Definition: LSX/PacketMath.h:1490
EIGEN_STRONG_INLINE Packet8us plset< Packet8us >(const unsigned short int &a)
Definition: AltiVec/PacketMath.h:1053
__vector unsigned short int Packet8us
Definition: AltiVec/PacketMath.h:38
EIGEN_STRONG_INLINE Packet2f pset1frombits< Packet2f >(uint32_t from)
Definition: NEON/PacketMath.h:742
EIGEN_STRONG_INLINE Packet2l pcmp_eq< Packet2l >(const Packet2l &a, const Packet2l &b)
Definition: LSX/PacketMath.h:1151
EIGEN_STRONG_INLINE Packet4f vec4f_movehl(const Packet4f &a, const Packet4f &b)
Definition: LSX/PacketMath.h:135
EIGEN_STRONG_INLINE Packet2d pxor< Packet2d >(const Packet2d &a, const Packet2d &b)
Definition: LSX/PacketMath.h:962
EIGEN_STRONG_INLINE uint8_t predux_max< Packet4uc >(const Packet4uc &a)
Definition: NEON/PacketMath.h:3983
EIGEN_STRONG_INLINE uint32_t predux_min< Packet2ui >(const Packet2ui &a)
Definition: NEON/PacketMath.h:3910
EIGEN_DEVICE_FUNC Packet pdiv(const Packet &a, const Packet &b)
Definition: GenericPacketMath.h:368
EIGEN_STRONG_INLINE Packet2d por< Packet2d >(const Packet2d &a, const Packet2d &b)
Definition: LSX/PacketMath.h:921
EIGEN_STRONG_INLINE Packet2ul pload< Packet2ul >(const uint64_t *from)
Definition: LSX/PacketMath.h:1439
EIGEN_STRONG_INLINE Packet4f shuffle2< true >(const Packet4f &m, const Packet4f &n, int mask)
Definition: LSX/PacketMath.h:114
EIGEN_STRONG_INLINE void pstore< bfloat16 >(bfloat16 *to, const Packet8bf &from)
Definition: AltiVec/PacketMath.h:662
EIGEN_STRONG_INLINE Packet2d pldexp< Packet2d >(const Packet2d &a, const Packet2d &exponent)
Definition: LSX/PacketMath.h:2753
EIGEN_STRONG_INLINE Packet2f pmul< Packet2f >(const Packet2f &a, const Packet2f &b)
Definition: NEON/PacketMath.h:1123
EIGEN_STRONG_INLINE Packet2f pmax< PropagateNaN, Packet2f >(const Packet2f &a, const Packet2f &b)
Definition: NEON/PacketMath.h:1560
EIGEN_STRONG_INLINE int16_t predux_mul< Packet4s >(const Packet4s &a)
Definition: NEON/PacketMath.h:3702
EIGEN_STRONG_INLINE Packet2ui pand< Packet2ui >(const Packet2ui &a, const Packet2ui &b)
Definition: NEON/PacketMath.h:1955
EIGEN_STRONG_INLINE Packet4us pmax< Packet4us >(const Packet4us &a, const Packet4us &b)
Definition: NEON/PacketMath.h:1599
EIGEN_STRONG_INLINE Packet4i pdiv< Packet4i >(const Packet4i &a, const Packet4i &b)
Definition: AltiVec/PacketMath.h:1205
EIGEN_STRONG_INLINE Packet2l padd< Packet2l >(const Packet2l &a, const Packet2l &b)
Definition: LSX/PacketMath.h:581
EIGEN_STRONG_INLINE Packet8s ploadu< Packet8s >(const short int *from)
Definition: AltiVec/PacketMath.h:1541
EIGEN_STRONG_INLINE Packet4f pmax< PropagateNumbers, Packet4f >(const Packet4f &a, const Packet4f &b)
Definition: SSE/PacketMath.h:1145
EIGEN_STRONG_INLINE Packet4ui pcmp_le< Packet4ui >(const Packet4ui &a, const Packet4ui &b)
Definition: LSX/PacketMath.h:1072
EIGEN_STRONG_INLINE Packet2f pmin< PropagateNaN, Packet2f >(const Packet2f &a, const Packet2f &b)
Definition: NEON/PacketMath.h:1459
EIGEN_STRONG_INLINE uint16_t predux_min< Packet4us >(const Packet4us &a)
Definition: NEON/PacketMath.h:3889
EIGEN_STRONG_INLINE Packet2ui padd< Packet2ui >(const Packet2ui &a, const Packet2ui &b)
Definition: NEON/PacketMath.h:898
EIGEN_STRONG_INLINE Packet2l pcmp_lt< Packet2l >(const Packet2l &a, const Packet2l &b)
Definition: LSX/PacketMath.h:1101
EIGEN_STRONG_INLINE Packet4uc pmul< Packet4uc >(const Packet4uc &a, const Packet4uc &b)
Definition: NEON/PacketMath.h:1144
EIGEN_STRONG_INLINE Packet16uc pdiv< Packet16uc >(const Packet16uc &a, const Packet16uc &b)
Definition: LSX/PacketMath.h:2789
EIGEN_STRONG_INLINE Packet4c pabsdiff< Packet4c >(const Packet4c &a, const Packet4c &b)
Definition: NEON/PacketMath.h:1373
EIGEN_STRONG_INLINE Packet4f ploaddup< Packet4f >(const float *from)
Definition: AltiVec/PacketMath.h:1640
EIGEN_STRONG_INLINE Packet4f por< Packet4f >(const Packet4f &a, const Packet4f &b)
Definition: AltiVec/PacketMath.h:1427
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter< uint8_t, Packet8uc >(uint8_t *to, const Packet8uc &from, Index stride)
Definition: NEON/PacketMath.h:3051
EIGEN_STRONG_INLINE Packet2l por< Packet2l >(const Packet2l &a, const Packet2l &b)
Definition: LSX/PacketMath.h:937
EIGEN_STRONG_INLINE void prefetch< int32_t >(const int32_t *addr)
Definition: LSX/PacketMath.h:1848
EIGEN_STRONG_INLINE Packet8uc pset1< Packet8uc >(const uint8_t &from)
Definition: NEON/PacketMath.h:693
EIGEN_STRONG_INLINE Packet2d vec2d_swizzle2(const Packet2d &a, const Packet2d &b, int mask)
Definition: LSX/PacketMath.h:157
EIGEN_STRONG_INLINE Packet4ui pabsdiff< Packet4ui >(const Packet4ui &a, const Packet4ui &b)
Definition: LSX/PacketMath.h:2838
EIGEN_STRONG_INLINE Packet2i pmin< Packet2i >(const Packet2i &a, const Packet2i &b)
Definition: NEON/PacketMath.h:1506
EIGEN_STRONG_INLINE Packet4i plogical_shift_left(const Packet4i &a)
Definition: AltiVec/PacketMath.h:1983
EIGEN_STRONG_INLINE Packet8s plset< Packet8s >(const short int &a)
Definition: AltiVec/PacketMath.h:1049
EIGEN_STRONG_INLINE Packet4c pcmp_eq< Packet4c >(const Packet4c &a, const Packet4c &b)
Definition: NEON/PacketMath.h:1812
EIGEN_STRONG_INLINE Packet16uc padd< Packet16uc >(const Packet16uc &a, const Packet16uc &b)
Definition: AltiVec/PacketMath.h:1090
EIGEN_STRONG_INLINE int predux_min< Packet4i >(const Packet4i &a)
Definition: AltiVec/PacketMath.h:2604
EIGEN_STRONG_INLINE Packet2ui pload< Packet2ui >(const uint32_t *from)
Definition: NEON/PacketMath.h:2446
EIGEN_STRONG_INLINE Packet16uc psub< Packet16uc >(const Packet16uc &a, const Packet16uc &b)
Definition: AltiVec/PacketMath.h:1115
EIGEN_STRONG_INLINE Packet8uc plset< Packet8uc >(const uint8_t &a)
Definition: NEON/PacketMath.h:779
EIGEN_STRONG_INLINE Packet2f pcmp_lt_or_nan< Packet2f >(const Packet2f &a, const Packet2f &b)
Definition: NEON/PacketMath.h:1889
EIGEN_STRONG_INLINE Packet8c pmul< Packet8c >(const Packet8c &a, const Packet8c &b)
Definition: NEON/PacketMath.h:1136
EIGEN_STRONG_INLINE Packet4bf ploadu< Packet4bf >(const bfloat16 *from)
Definition: NEON/PacketMath.h:4840
EIGEN_STRONG_INLINE Packet8c pabsdiff< Packet8c >(const Packet8c &a, const Packet8c &b)
Definition: NEON/PacketMath.h:1378
EIGEN_STRONG_INLINE Packet4i pxor< Packet4i >(const Packet4i &a, const Packet4i &b)
Definition: AltiVec/PacketMath.h:1452
EIGEN_STRONG_INLINE Packet2i padd< Packet2i >(const Packet2i &a, const Packet2i &b)
Definition: NEON/PacketMath.h:890
EIGEN_STRONG_INLINE double predux_max< Packet2d >(const Packet2d &a)
Definition: LSX/PacketMath.h:2127
EIGEN_STRONG_INLINE int8_t predux_max< Packet8c >(const Packet8c &a)
Definition: NEON/PacketMath.h:3967
EIGEN_STRONG_INLINE Packet4c pcmp_lt< Packet4c >(const Packet4c &a, const Packet4c &b)
Definition: NEON/PacketMath.h:1727
EIGEN_STRONG_INLINE Packet4f pmul< Packet4f >(const Packet4f &a, const Packet4f &b)
Definition: AltiVec/PacketMath.h:1162
EIGEN_STRONG_INLINE Packet8us pcmp_lt< Packet8us >(const Packet8us &a, const Packet8us &b)
Definition: LSX/PacketMath.h:1109
EIGEN_STRONG_INLINE Packet2f pcmp_lt< Packet2f >(const Packet2f &a, const Packet2f &b)
Definition: NEON/PacketMath.h:1719
EIGEN_STRONG_INLINE Packet4s pxor< Packet4s >(const Packet4s &a, const Packet4s &b)
Definition: NEON/PacketMath.h:2077
EIGEN_STRONG_INLINE Packet2l pcmp_le< Packet2l >(const Packet2l &a, const Packet2l &b)
Definition: LSX/PacketMath.h:1060
EIGEN_STRONG_INLINE uint32_t predux< Packet4ui >(const Packet4ui &a)
Definition: LSX/PacketMath.h:2004
EIGEN_STRONG_INLINE Packet4i pcmp_eq< Packet4i >(const Packet4i &a, const Packet4i &b)
Definition: LSX/PacketMath.h:1147
EIGEN_STRONG_INLINE Packet4ui por< Packet4ui >(const Packet4ui &a, const Packet4ui &b)
Definition: LSX/PacketMath.h:949
EIGEN_STRONG_INLINE Packet4c plset< Packet4c >(const int8_t &a)
Definition: NEON/PacketMath.h:761
EIGEN_STRONG_INLINE Packet8uc ploaddup< Packet8uc >(const uint8_t *from)
Definition: NEON/PacketMath.h:2569
EIGEN_STRONG_INLINE Packet4ui pmin< Packet4ui >(const Packet4ui &a, const Packet4ui &b)
Definition: LSX/PacketMath.h:1196
EIGEN_STRONG_INLINE Packet4c pmin< Packet4c >(const Packet4c &a, const Packet4c &b)
Definition: NEON/PacketMath.h:1464
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4f pgather< float, Packet4f >(const float *from, Index stride)
Definition: AltiVec/PacketMath.h:853
EIGEN_STRONG_INLINE Packet8us pmax< Packet8us >(const Packet8us &a, const Packet8us &b)
Definition: AltiVec/PacketMath.h:1301
EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f &a, const Packet4f &b)
Definition: AltiVec/PacketMath.h:1314
EIGEN_STRONG_INLINE Packet4f paddsub< Packet4f >(const Packet4f &a, const Packet4f &b)
Definition: LSX/PacketMath.h:653
EIGEN_STRONG_INLINE Packet8c plset< Packet8c >(const int8_t &a)
Definition: NEON/PacketMath.h:765
EIGEN_STRONG_INLINE Packet2d pset1< Packet2d >(const double &from)
Definition: LSX/PacketMath.h:503
EIGEN_STRONG_INLINE Packet4i plogical_shift_right(const Packet4i &a)
Definition: AltiVec/PacketMath.h:1979
EIGEN_STRONG_INLINE Packet4uc pcmp_eq< Packet4uc >(const Packet4uc &a, const Packet4uc &b)
Definition: NEON/PacketMath.h:1825
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us pgather< uint16_t, Packet4us >(const uint16_t *from, Index stride)
Definition: NEON/PacketMath.h:2938
EIGEN_STRONG_INLINE Packet2ui pcmp_eq< Packet2ui >(const Packet2ui &a, const Packet2ui &b)
Definition: NEON/PacketMath.h:1862
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter< uint32_t, Packet2ui >(uint32_t *to, const Packet2ui &from, Index stride)
Definition: NEON/PacketMath.h:3137
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter< uint32_t, Packet4ui >(uint32_t *to, const Packet4ui &from, Index stride)
Definition: LSX/PacketMath.h:1817
EIGEN_STRONG_INLINE unsigned short int predux< Packet8us >(const Packet8us &a)
Definition: AltiVec/PacketMath.h:2483
EIGEN_STRONG_INLINE Packet4f pload< Packet4f >(const float *from)
Definition: AltiVec/PacketMath.h:492
__vector signed char Packet16c
Definition: AltiVec/PacketMath.h:40
EIGEN_STRONG_INLINE Packet16uc ploadquad< Packet16uc >(const unsigned char *from)
Definition: AltiVec/PacketMath.h:1724
EIGEN_STRONG_INLINE int predux_mul< Packet4i >(const Packet4i &a)
Definition: AltiVec/PacketMath.h:2529
EIGEN_STRONG_INLINE void pstoreu< uint16_t >(uint16_t *to, const Packet8us &from)
Definition: LSX/PacketMath.h:1603
EIGEN_STRONG_INLINE Packet4uc pload< Packet4uc >(const uint8_t *from)
Definition: NEON/PacketMath.h:2408
EIGEN_STRONG_INLINE Packet4us plset< Packet4us >(const uint16_t &a)
Definition: NEON/PacketMath.h:794
EIGEN_STRONG_INLINE Packet16uc pload< Packet16uc >(const unsigned char *from)
Definition: AltiVec/PacketMath.h:517
EIGEN_STRONG_INLINE Packet4us ploaddup< Packet4us >(const uint16_t *from)
Definition: NEON/PacketMath.h:2591
EIGEN_STRONG_INLINE Packet4s por< Packet4s >(const Packet4s &a, const Packet4s &b)
Definition: NEON/PacketMath.h:2004
EIGEN_STRONG_INLINE Packet8us ploadu< Packet8us >(const unsigned short int *from)
Definition: AltiVec/PacketMath.h:1545
EIGEN_STRONG_INLINE Packet4c por< Packet4c >(const Packet4c &a, const Packet4c &b)
Definition: NEON/PacketMath.h:1980
EIGEN_STRONG_INLINE Packet2f pldexp< Packet2f >(const Packet2f &a, const Packet2f &exponent)
Definition: NEON/PacketMath.h:3449
EIGEN_STRONG_INLINE int16_t predux< Packet4s >(const Packet4s &a)
Definition: NEON/PacketMath.h:3583
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter< int32_t, Packet4i >(int32_t *to, const Packet4i &from, Index stride)
Definition: LSX/PacketMath.h:1771
EIGEN_STRONG_INLINE Packet8h por(const Packet8h &a, const Packet8h &b)
Definition: AVX/PacketMath.h:2309
EIGEN_STRONG_INLINE Packet2f pmin< Packet2f >(const Packet2f &a, const Packet2f &b)
Definition: NEON/PacketMath.h:1432
EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i &a, const Packet4i &b)
Definition: AltiVec/PacketMath.h:1341
EIGEN_STRONG_INLINE Packet2ui pmin< Packet2ui >(const Packet2ui &a, const Packet2ui &b)
Definition: NEON/PacketMath.h:1514
EIGEN_STRONG_INLINE Packet8us pmul< Packet8us >(const Packet8us &a, const Packet8us &b)
Definition: AltiVec/PacketMath.h:1174
EIGEN_STRONG_INLINE uint64_t predux_max< Packet2ul >(const Packet2ul &a)
Definition: LSX/PacketMath.h:2171
EIGEN_STRONG_INLINE Packet8s pand< Packet8s >(const Packet8s &a, const Packet8s &b)
Definition: LSX/PacketMath.h:888
__vector unsigned int Packet4ui
Definition: AltiVec/PacketMath.h:35
EIGEN_STRONG_INLINE Packet2d pmax< PropagateNaN, Packet2d >(const Packet2d &a, const Packet2d &b)
Definition: LSX/PacketMath.h:2733
EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan< Packet4f >(const Packet4f &a, const Packet4f &b)
Definition: LSX/PacketMath.h:1122
EIGEN_STRONG_INLINE Packet4f pmin< PropagateNaN, Packet4f >(const Packet4f &a, const Packet4f &b)
Definition: LSX/PacketMath.h:2695
EIGEN_STRONG_INLINE Packet2i pload< Packet2i >(const int32_t *from)
Definition: NEON/PacketMath.h:2438
EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf &a)
Definition: AltiVec/Complex.h:303
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter< int16_t, Packet4s >(int16_t *to, const Packet4s &from, Index stride)
Definition: NEON/PacketMath.h:3083
EIGEN_STRONG_INLINE Packet2ui pmax< Packet2ui >(const Packet2ui &a, const Packet2ui &b)
Definition: NEON/PacketMath.h:1615
EIGEN_STRONG_INLINE Packet8s pcmp_le< Packet8s >(const Packet8s &a, const Packet8s &b)
Definition: LSX/PacketMath.h:1052
EIGEN_STRONG_INLINE uint16_t predux< Packet4us >(const Packet4us &a)
Definition: NEON/PacketMath.h:3595
EIGEN_STRONG_INLINE void pstore< double >(double *to, const Packet4d &from)
Definition: AVX/PacketMath.h:1611
EIGEN_STRONG_INLINE Packet4i padd< Packet4i >(const Packet4i &a, const Packet4i &b)
Definition: AltiVec/PacketMath.h:1070
EIGEN_STRONG_INLINE Packet4f pfloor< Packet4f >(const Packet4f &a)
Definition: AltiVec/PacketMath.h:1497
EIGEN_STRONG_INLINE Packet16c pcmp_eq< Packet16c >(const Packet16c &a, const Packet16c &b)
Definition: LSX/PacketMath.h:1139
EIGEN_STRONG_INLINE uint32_t pfirst< Packet4ui >(const Packet4ui &a)
Definition: LSX/PacketMath.h:1910
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter< float, Packet2f >(float *to, const Packet2f &from, Index stride)
Definition: NEON/PacketMath.h:2999
EIGEN_STRONG_INLINE Packet4c pand< Packet4c >(const Packet4c &a, const Packet4c &b)
Definition: NEON/PacketMath.h:1907
EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f &a, const Packet4f &b, const Packet4f &c)
Definition: AltiVec/PacketMath.h:1218
EIGEN_STRONG_INLINE Packet8c pcmp_le< Packet8c >(const Packet8c &a, const Packet8c &b)
Definition: NEON/PacketMath.h:1647
EIGEN_STRONG_INLINE Packet16c pcmp_lt< Packet16c >(const Packet16c &a, const Packet16c &b)
Definition: LSX/PacketMath.h:1089
EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf &a, const Packet4cf &b)
Definition: AVX/Complex.h:88
EIGEN_DEVICE_FUNC Packet preciprocal(const Packet &a)
Definition: GenericPacketMath.h:1433
EIGEN_STRONG_INLINE Packet4us padd< Packet4us >(const Packet4us &a, const Packet4us &b)
Definition: NEON/PacketMath.h:882
EIGEN_STRONG_INLINE Packet4us pmul< Packet4us >(const Packet4us &a, const Packet4us &b)
Definition: NEON/PacketMath.h:1165
EIGEN_STRONG_INLINE Packet8s pdiv< Packet8s >(const Packet8s &a, const Packet8s &b)
Definition: LSX/PacketMath.h:786
EIGEN_STRONG_INLINE Packet4i pandnot< Packet4i >(const Packet4i &a, const Packet4i &b)
Definition: AltiVec/PacketMath.h:1469
EIGEN_STRONG_INLINE unsigned char predux_max< Packet16uc >(const Packet16uc &a)
Definition: AltiVec/PacketMath.h:2739
EIGEN_STRONG_INLINE signed char predux_max< Packet16c >(const Packet16c &a)
Definition: AltiVec/PacketMath.h:2727
EIGEN_STRONG_INLINE uint8_t predux_min< Packet8uc >(const Packet8uc &a)
Definition: NEON/PacketMath.h:3862
EIGEN_STRONG_INLINE Packet8us pandnot< Packet8us >(const Packet8us &a, const Packet8us &b)
Definition: LSX/PacketMath.h:1027
EIGEN_STRONG_INLINE Packet2ul pmul< Packet2ul >(const Packet2ul &a, const Packet2ul &b)
Definition: LSX/PacketMath.h:773
EIGEN_STRONG_INLINE Packet4ui pandnot< Packet4ui >(const Packet4ui &a, const Packet4ui &b)
Definition: LSX/PacketMath.h:1031
EIGEN_STRONG_INLINE Packet8s ploaddup< Packet8s >(const short int *from)
Definition: AltiVec/PacketMath.h:1649
EIGEN_STRONG_INLINE Packet4bf F32MaskToBf16Mask(const Packet4f &p)
Definition: NEON/PacketMath.h:4822
EIGEN_STRONG_INLINE uint32_t predux< Packet2ui >(const Packet2ui &a)
Definition: NEON/PacketMath.h:3616
EIGEN_STRONG_INLINE Packet2ul pmax< Packet2ul >(const Packet2ul &a, const Packet2ul &b)
Definition: LSX/PacketMath.h:1233
EIGEN_STRONG_INLINE Packet4f pdiv< Packet4f >(const Packet4f &a, const Packet4f &b)
Definition: AltiVec/PacketMath.h:1187
EIGEN_STRONG_INLINE Packet4uc pmin< Packet4uc >(const Packet4uc &a, const Packet4uc &b)
Definition: NEON/PacketMath.h:1477
EIGEN_STRONG_INLINE Packet8h pandnot(const Packet8h &a, const Packet8h &b)
Definition: AVX/PacketMath.h:2323
EIGEN_STRONG_INLINE Packet8uc pmax< Packet8uc >(const Packet8uc &a, const Packet8uc &b)
Definition: NEON/PacketMath.h:1583
EIGEN_STRONG_INLINE Packet2d pload< Packet2d >(const double *from)
Definition: LSX/PacketMath.h:1407
EIGEN_STRONG_INLINE Packet4uc pcmp_lt< Packet4uc >(const Packet4uc &a, const Packet4uc &b)
Definition: NEON/PacketMath.h:1740
EIGEN_STRONG_INLINE Packet8uc pmul< Packet8uc >(const Packet8uc &a, const Packet8uc &b)
Definition: NEON/PacketMath.h:1149
EIGEN_STRONG_INLINE bfloat16 predux_max< Packet4bf >(const Packet4bf &a)
Definition: NEON/PacketMath.h:4988
EIGEN_STRONG_INLINE Packet8us pload< Packet8us >(const unsigned short int *from)
Definition: AltiVec/PacketMath.h:507
EIGEN_STRONG_INLINE Packet16uc pcmp_lt< Packet16uc >(const Packet16uc &a, const Packet16uc &b)
Definition: LSX/PacketMath.h:1105
EIGEN_STRONG_INLINE Packet4c pcmp_le< Packet4c >(const Packet4c &a, const Packet4c &b)
Definition: NEON/PacketMath.h:1642
EIGEN_STRONG_INLINE Packet2d pmul< Packet2d >(const Packet2d &a, const Packet2d &b)
Definition: LSX/PacketMath.h:741
EIGEN_STRONG_INLINE Packet4i pabsdiff< Packet4i >(const Packet4i &a, const Packet4i &b)
Definition: LSX/PacketMath.h:2774
EIGEN_STRONG_INLINE Packet8uc psub< Packet8uc >(const Packet8uc &a, const Packet8uc &b)
Definition: NEON/PacketMath.h:941
EIGEN_STRONG_INLINE bfloat16 pfirst< Packet4bf >(const Packet4bf &from)
Definition: NEON/PacketMath.h:4830
EIGEN_STRONG_INLINE Packet4s pmax< Packet4s >(const Packet4s &a, const Packet4s &b)
Definition: NEON/PacketMath.h:1591
EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf &a)
Definition: AltiVec/Complex.h:264
EIGEN_STRONG_INLINE Packet2ui pandnot< Packet2ui >(const Packet2ui &a, const Packet2ui &b)
Definition: NEON/PacketMath.h:2174
EIGEN_STRONG_INLINE Packet4f pfrexp< Packet4f >(const Packet4f &a, Packet4f &exponent)
Definition: AltiVec/PacketMath.h:2328
EIGEN_STRONG_INLINE float predux_mul< Packet4f >(const Packet4f &a)
Definition: AltiVec/PacketMath.h:2522
EIGEN_STRONG_INLINE Packet2ul plset< Packet2ul >(const uint64_t &a)
Definition: LSX/PacketMath.h:553
EIGEN_STRONG_INLINE Packet2d pmin< PropagateNumbers, Packet2d >(const Packet2d &a, const Packet2d &b)
Definition: SSE/PacketMath.h:1141
EIGEN_STRONG_INLINE Packet2f pandnot< Packet2f >(const Packet2f &a, const Packet2f &b)
Definition: NEON/PacketMath.h:2118
EIGEN_STRONG_INLINE Packet2i pcmp_lt< Packet2i >(const Packet2i &a, const Packet2i &b)
Definition: NEON/PacketMath.h:1769
EIGEN_STRONG_INLINE Packet8uc pcmp_le< Packet8uc >(const Packet8uc &a, const Packet8uc &b)
Definition: NEON/PacketMath.h:1660
EIGEN_STRONG_INLINE Packet2f pmax< Packet2f >(const Packet2f &a, const Packet2f &b)
Definition: NEON/PacketMath.h:1533
EIGEN_STRONG_INLINE void prefetch< float >(const float *addr)
Definition: AltiVec/PacketMath.h:1854
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter< int16_t, Packet8s >(int16_t *to, const Packet8s &from, Index stride)
Definition: LSX/PacketMath.h:1759
EIGEN_STRONG_INLINE void pstoreu< bfloat16 >(bfloat16 *to, const Packet8bf &from)
Definition: AltiVec/PacketMath.h:1772
EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packet4i &a)
Definition: AltiVec/PacketMath.h:1975
EIGEN_STRONG_INLINE Packet4us pcmp_eq< Packet4us >(const Packet4us &a, const Packet4us &b)
Definition: NEON/PacketMath.h:1846
EIGEN_STRONG_INLINE Packet4us pxor< Packet4us >(const Packet4us &a, const Packet4us &b)
Definition: NEON/PacketMath.h:2085
EIGEN_STRONG_INLINE int32_t pfirst< Packet2i >(const Packet2i &a)
Definition: NEON/PacketMath.h:3249
EIGEN_STRONG_INLINE Packet4ui pcmp_lt< Packet4ui >(const Packet4ui &a, const Packet4ui &b)
Definition: LSX/PacketMath.h:1113
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter< int64_t, Packet2l >(int64_t *to, const Packet2l &from, Index stride)
Definition: LSX/PacketMath.h:1779
EIGEN_STRONG_INLINE Packet8s padd< Packet8s >(const Packet8s &a, const Packet8s &b)
Definition: AltiVec/PacketMath.h:1078
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter< double, Packet2d >(double *to, const Packet2d &from, Index stride)
Definition: LSX/PacketMath.h:1734
EIGEN_STRONG_INLINE Packet4uc psub< Packet4uc >(const Packet4uc &a, const Packet4uc &b)
Definition: NEON/PacketMath.h:936
EIGEN_STRONG_INLINE bfloat16 predux_min< Packet4bf >(const Packet4bf &a)
Definition: NEON/PacketMath.h:4993
EIGEN_STRONG_INLINE Packet4ui pload< Packet4ui >(const uint32_t *from)
Definition: LSX/PacketMath.h:1435
EIGEN_STRONG_INLINE Packet4f prsqrt_float_unsafe(const Packet4f &a)
Definition: NEON/PacketMath.h:4591
EIGEN_STRONG_INLINE Packet2i pmul< Packet2i >(const Packet2i &a, const Packet2i &b)
Definition: NEON/PacketMath.h:1173
EIGEN_STRONG_INLINE Packet16uc pandnot< Packet16uc >(const Packet16uc &a, const Packet16uc &b)
Definition: LSX/PacketMath.h:1023
EIGEN_STRONG_INLINE Packet8us ploaddup< Packet8us >(const unsigned short int *from)
Definition: AltiVec/PacketMath.h:1659
EIGEN_STRONG_INLINE Packet4f pmax< PropagateNaN, Packet4f >(const Packet4f &a, const Packet4f &b)
Definition: LSX/PacketMath.h:2699
EIGEN_STRONG_INLINE Packet4i ploadu< Packet4i >(const int *from)
Definition: AltiVec/PacketMath.h:1537
EIGEN_STRONG_INLINE double predux_mul< Packet2d >(const Packet2d &a)
Definition: LSX/PacketMath.h:2019
__vector short int Packet8s
Definition: AltiVec/PacketMath.h:37
EIGEN_STRONG_INLINE Packet2d pdiv< Packet2d >(const Packet2d &a, const Packet2d &b)
Definition: LSX/PacketMath.h:782
EIGEN_STRONG_INLINE Packet2ul ploadu< Packet2ul >(const uint64_t *from)
Definition: LSX/PacketMath.h:1480
EIGEN_STRONG_INLINE Packet2ui pcmp_lt< Packet2ui >(const Packet2ui &a, const Packet2ui &b)
Definition: NEON/PacketMath.h:1777
EIGEN_STRONG_INLINE Packet8bf psignbit(const Packet8bf &a)
Definition: AltiVec/PacketMath.h:1966
EIGEN_STRONG_INLINE Packet2i pset1< Packet2i >(const int32_t &from)
Definition: NEON/PacketMath.h:717
EIGEN_STRONG_INLINE Packet4uc pabsdiff< Packet4uc >(const Packet4uc &a, const Packet4uc &b)
Definition: NEON/PacketMath.h:1386
EIGEN_STRONG_INLINE Packet4c pset1< Packet4c >(const int8_t &from)
Definition: NEON/PacketMath.h:677
EIGEN_STRONG_INLINE uint64_t predux_min< Packet2ul >(const Packet2ul &a)
Definition: LSX/PacketMath.h:2117
EIGEN_STRONG_INLINE Packet4s pandnot< Packet4s >(const Packet4s &a, const Packet4s &b)
Definition: NEON/PacketMath.h:2150
EIGEN_STRONG_INLINE Packet2i pdiv< Packet2i >(const Packet2i &, const Packet2i &)
Definition: NEON/PacketMath.h:1250
EIGEN_STRONG_INLINE double predux_min< Packet2d >(const Packet2d &a)
Definition: LSX/PacketMath.h:2073
EIGEN_STRONG_INLINE Packet8c pcmp_lt< Packet8c >(const Packet8c &a, const Packet8c &b)
Definition: NEON/PacketMath.h:1732
EIGEN_STRONG_INLINE Packet4s ploadu< Packet4s >(const int16_t *from)
Definition: NEON/PacketMath.h:2499
EIGEN_STRONG_INLINE Packet4f pset1< Packet4f >(const float &from)
Definition: AltiVec/PacketMath.h:773
EIGEN_STRONG_INLINE Packet2ui por< Packet2ui >(const Packet2ui &a, const Packet2ui &b)
Definition: NEON/PacketMath.h:2028
EIGEN_STRONG_INLINE Packet4us pand< Packet4us >(const Packet4us &a, const Packet4us &b)
Definition: NEON/PacketMath.h:1939
EIGEN_STRONG_INLINE Packet4ui pmax< Packet4ui >(const Packet4ui &a, const Packet4ui &b)
Definition: LSX/PacketMath.h:1229
EIGEN_STRONG_INLINE int8_t predux_min< Packet4c >(const Packet4c &a)
Definition: NEON/PacketMath.h:3780
EIGEN_STRONG_INLINE Packet8c ploadquad< Packet8c >(const int8_t *from)
Definition: NEON/PacketMath.h:2635
uint16x4_t Packet4us
Definition: NEON/PacketMath.h:85
EIGEN_STRONG_INLINE uint32_t predux_min< Packet4ui >(const Packet4ui &a)
Definition: LSX/PacketMath.h:2112
EIGEN_STRONG_INLINE uint8_t predux_mul< Packet4uc >(const Packet4uc &a)
Definition: NEON/PacketMath.h:3686
EIGEN_STRONG_INLINE Packet4s ploaddup< Packet4s >(const int16_t *from)
Definition: NEON/PacketMath.h:2580
EIGEN_STRONG_INLINE Packet4i psub< Packet4i >(const Packet4i &a, const Packet4i &b)
Definition: AltiVec/PacketMath.h:1099
EIGEN_STRONG_INLINE Packet8c pmax< Packet8c >(const Packet8c &a, const Packet8c &b)
Definition: NEON/PacketMath.h:1570
EIGEN_STRONG_INLINE Packet4f pmin< PropagateNumbers, Packet4f >(const Packet4f &a, const Packet4f &b)
Definition: SSE/PacketMath.h:1137
EIGEN_STRONG_INLINE Packet4ui plset< Packet4ui >(const uint32_t &a)
Definition: LSX/PacketMath.h:548
EIGEN_STRONG_INLINE Packet4bf pnegate< Packet4bf >(const Packet4bf &a)
Definition: NEON/PacketMath.h:5037
EIGEN_STRONG_INLINE Packet4s plset< Packet4s >(const int16_t &a)
Definition: NEON/PacketMath.h:789
EIGEN_STRONG_INLINE Packet2f preciprocal< Packet2f >(const Packet2f &a)
Definition: NEON/PacketMath.h:4641
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter< float, Packet4f >(float *to, const Packet4f &from, Index stride)
Definition: AltiVec/PacketMath.h:954
EIGEN_STRONG_INLINE uint16_t pfirst< Packet4us >(const Packet4us &a)
Definition: NEON/PacketMath.h:3241
EIGEN_STRONG_INLINE Packet2d plset< Packet2d >(const double &a)
Definition: LSX/PacketMath.h:563
EIGEN_STRONG_INLINE uint64_t predux_mul< Packet2ul >(const Packet2ul &a)
Definition: LSX/PacketMath.h:2063
EIGEN_STRONG_INLINE Packet8uc ploadu< Packet8uc >(const uint8_t *from)
Definition: NEON/PacketMath.h:2491
EIGEN_STRONG_INLINE Packet8s pload< Packet8s >(const short int *from)
Definition: AltiVec/PacketMath.h:502
EIGEN_STRONG_INLINE Packet8uc ploadquad< Packet8uc >(const uint8_t *from)
Definition: NEON/PacketMath.h:2652
EIGEN_STRONG_INLINE Packet2ul padd< Packet2ul >(const Packet2ul &a, const Packet2ul &b)
Definition: LSX/PacketMath.h:597
uint8x8_t Packet8uc
Definition: NEON/PacketMath.h:81
EIGEN_STRONG_INLINE Packet8us pxor< Packet8us >(const Packet8us &a, const Packet8us &b)
Definition: AltiVec/PacketMath.h:1456
EIGEN_STRONG_INLINE Packet2f plset< Packet2f >(const float &a)
Definition: NEON/PacketMath.h:751
EIGEN_STRONG_INLINE int64_t predux_mul< Packet2l >(const Packet2l &a)
Definition: LSX/PacketMath.h:2041
EIGEN_STRONG_INLINE Packet8us padd< Packet8us >(const Packet8us &a, const Packet8us &b)
Definition: AltiVec/PacketMath.h:1082
EIGEN_STRONG_INLINE int16_t predux_min< Packet4s >(const Packet4s &a)
Definition: NEON/PacketMath.h:3877
EIGEN_STRONG_INLINE Packet4f pceil< Packet4f >(const Packet4f &a)
Definition: AltiVec/PacketMath.h:1493
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16c pgather< int8_t, Packet16c >(const int8_t *from, Index stride)
Definition: LSX/PacketMath.h:1626
EIGEN_STRONG_INLINE Packet16uc pand< Packet16uc >(const Packet16uc &a, const Packet16uc &b)
Definition: LSX/PacketMath.h:900
EIGEN_STRONG_INLINE void pstore< float >(float *to, const Packet4f &from)
Definition: AltiVec/PacketMath.h:642
EIGEN_STRONG_INLINE Packet2l pset1< Packet2l >(const int64_t &from)
Definition: LSX/PacketMath.h:478
EIGEN_STRONG_INLINE Packet4bf pcmp_lt< Packet4bf >(const Packet4bf &a, const Packet4bf &b)
Definition: NEON/PacketMath.h:5022
EIGEN_STRONG_INLINE Packet8uc por< Packet8uc >(const Packet8uc &a, const Packet8uc &b)
Definition: NEON/PacketMath.h:1996
EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f &a)
Definition: AltiVec/PacketMath.h:1936
EIGEN_STRONG_INLINE Packet2ui pmul< Packet2ui >(const Packet2ui &a, const Packet2ui &b)
Definition: NEON/PacketMath.h:1181
EIGEN_STRONG_INLINE Packet2ui pxor< Packet2ui >(const Packet2ui &a, const Packet2ui &b)
Definition: NEON/PacketMath.h:2101
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter< uint8_t, Packet4uc >(uint8_t *to, const Packet4uc &from, Index stride)
Definition: NEON/PacketMath.h:3046
EIGEN_STRONG_INLINE Packet4uc pand< Packet4uc >(const Packet4uc &a, const Packet4uc &b)
Definition: NEON/PacketMath.h:1919
EIGEN_STRONG_INLINE Packet2l ploaddup< Packet2l >(const int64_t *from)
Definition: LSX/PacketMath.h:1509
EIGEN_STRONG_INLINE Packet8uc padd< Packet8uc >(const Packet8uc &a, const Packet8uc &b)
Definition: NEON/PacketMath.h:866
EIGEN_ALWAYS_INLINE Packet2f make_packet2f(float a, float b)
Definition: NEON/PacketMath.h:95
EIGEN_STRONG_INLINE Packet2ul por< Packet2ul >(const Packet2ul &a, const Packet2ul &b)
Definition: LSX/PacketMath.h:953
EIGEN_STRONG_INLINE Packet16uc por< Packet16uc >(const Packet16uc &a, const Packet16uc &b)
Definition: LSX/PacketMath.h:941
EIGEN_STRONG_INLINE Packet4f ptrunc< Packet4f >(const Packet4f &a)
Definition: AltiVec/PacketMath.h:1501
EIGEN_STRONG_INLINE void pstore< uint8_t >(uint8_t *to, const Packet16uc &from)
Definition: LSX/PacketMath.h:1557
EIGEN_STRONG_INLINE Packet8uc pand< Packet8uc >(const Packet8uc &a, const Packet8uc &b)
Definition: NEON/PacketMath.h:1923
EIGEN_STRONG_INLINE Packet4i pcmp_le< Packet4i >(const Packet4i &a, const Packet4i &b)
Definition: LSX/PacketMath.h:1056
EIGEN_STRONG_INLINE uint64_t predux< Packet2ul >(const Packet2ul &a)
Definition: LSX/PacketMath.h:2009
EIGEN_STRONG_INLINE Packet8us por< Packet8us >(const Packet8us &a, const Packet8us &b)
Definition: AltiVec/PacketMath.h:1439
float32x2_t Packet2f
Definition: NEON/PacketMath.h:75
EIGEN_STRONG_INLINE Packet4ui pdiv< Packet4ui >(const Packet4ui &a, const Packet4ui &b)
Definition: LSX/PacketMath.h:802
EIGEN_STRONG_INLINE Packet4uc por< Packet4uc >(const Packet4uc &a, const Packet4uc &b)
Definition: NEON/PacketMath.h:1992
EIGEN_STRONG_INLINE Packet4f pset1frombits< Packet4f >(unsigned int from)
Definition: AltiVec/PacketMath.h:803
EIGEN_STRONG_INLINE Packet4bf plset< Packet4bf >(const bfloat16 &a)
Definition: NEON/PacketMath.h:4893
EIGEN_STRONG_INLINE Packet4us pcmp_lt< Packet4us >(const Packet4us &a, const Packet4us &b)
Definition: NEON/PacketMath.h:1761
EIGEN_STRONG_INLINE uint8_t predux_min< Packet4uc >(const Packet4uc &a)
Definition: NEON/PacketMath.h:3813
EIGEN_STRONG_INLINE Packet4uc pandnot< Packet4uc >(const Packet4uc &a, const Packet4uc &b)
Definition: NEON/PacketMath.h:2138
EIGEN_STRONG_INLINE Packet4uc pxor< Packet4uc >(const Packet4uc &a, const Packet4uc &b)
Definition: NEON/PacketMath.h:2065
EIGEN_STRONG_INLINE Packet4f pldexp< Packet4f >(const Packet4f &a, const Packet4f &exponent)
Definition: AltiVec/PacketMath.h:2319
EIGEN_STRONG_INLINE Packet2d ploadu< Packet2d >(const double *from)
Definition: LSX/PacketMath.h:1448
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter< int8_t, Packet8c >(int8_t *to, const Packet8c &from, Index stride)
Definition: NEON/PacketMath.h:3015
EIGEN_STRONG_INLINE Packet4us pdiv< Packet4us >(const Packet4us &, const Packet4us &)
Definition: NEON/PacketMath.h:1240
EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f &a, const Packet4f &b)
Definition: LSX/PacketMath.h:141
EIGEN_STRONG_INLINE float pfirst< Packet2f >(const Packet2f &a)
Definition: NEON/PacketMath.h:3201
EIGEN_STRONG_INLINE Packet4f pcmp_lt< Packet4f >(const Packet4f &a, const Packet4f &b)
Definition: LSX/PacketMath.h:1081
EIGEN_STRONG_INLINE Packet2f pfrexp< Packet2f >(const Packet2f &a, Packet2f &exponent)
Definition: NEON/PacketMath.h:3440
EIGEN_STRONG_INLINE Packet2i pabsdiff< Packet2i >(const Packet2i &a, const Packet2i &b)
Definition: NEON/PacketMath.h:1415
EIGEN_STRONG_INLINE Packet2ul pcmp_le< Packet2ul >(const Packet2ul &a, const Packet2ul &b)
Definition: LSX/PacketMath.h:1076
EIGEN_STRONG_INLINE Packet8c pmin< Packet8c >(const Packet8c &a, const Packet8c &b)
Definition: NEON/PacketMath.h:1469
EIGEN_STRONG_INLINE short int predux_mul< Packet8s >(const Packet8s &a)
Definition: AltiVec/PacketMath.h:2536
EIGEN_STRONG_INLINE int8_t predux_mul< Packet4c >(const Packet4c &a)
Definition: NEON/PacketMath.h:3670
EIGEN_STRONG_INLINE Packet4f pxor< Packet4f >(const Packet4f &a, const Packet4f &b)
Definition: AltiVec/PacketMath.h:1448
EIGEN_STRONG_INLINE void pstoreu< int32_t >(int32_t *to, const Packet4i &from)
Definition: LSX/PacketMath.h:1591
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter< int8_t, Packet16c >(int8_t *to, const Packet16c &from, Index stride)
Definition: LSX/PacketMath.h:1739
EIGEN_STRONG_INLINE Packet4i pmin< Packet4i >(const Packet4i &a, const Packet4i &b)
Definition: AltiVec/PacketMath.h:1261
EIGEN_STRONG_INLINE uint32_t pfirst< Packet2ui >(const Packet2ui &a)
Definition: NEON/PacketMath.h:3257
EIGEN_STRONG_INLINE Packet4uc pmax< Packet4uc >(const Packet4uc &a, const Packet4uc &b)
Definition: NEON/PacketMath.h:1578
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4c predux_half_dowto4(const Packet8c &a)
Definition: NEON/PacketMath.h:3635
EIGEN_STRONG_INLINE Packet4uc pcmp_le< Packet4uc >(const Packet4uc &a, const Packet4uc &b)
Definition: NEON/PacketMath.h:1655
EIGEN_STRONG_INLINE void prefetch< uint8_t >(const uint8_t *addr)
Definition: LSX/PacketMath.h:1856
EIGEN_STRONG_INLINE Packet2f ploadu< Packet2f >(const float *from)
Definition: NEON/PacketMath.h:2463
EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f &a)
Definition: LSX/PacketMath.h:2176
EIGEN_STRONG_INLINE Packet2i ploadu< Packet2i >(const int32_t *from)
Definition: NEON/PacketMath.h:2515
EIGEN_STRONG_INLINE Packet16uc plset< Packet16uc >(const unsigned char &a)
Definition: AltiVec/PacketMath.h:1061
EIGEN_STRONG_INLINE Packet16c ploadu< Packet16c >(const signed char *from)
Definition: AltiVec/PacketMath.h:1553
EIGEN_STRONG_INLINE Packet2l pmax< Packet2l >(const Packet2l &a, const Packet2l &b)
Definition: LSX/PacketMath.h:1217
EIGEN_STRONG_INLINE Packet2l psub< Packet2l >(const Packet2l &a, const Packet2l &b)
Definition: LSX/PacketMath.h:622
EIGEN_STRONG_INLINE uint8_t pfirst< Packet4uc >(const Packet4uc &a)
Definition: NEON/PacketMath.h:3221
EIGEN_STRONG_INLINE Packet2d pfrexp< Packet2d >(const Packet2d &a, Packet2d &exponent)
Definition: LSX/PacketMath.h:2677
EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf &a, const Packet2cf &b)
Definition: AltiVec/Complex.h:353
EIGEN_STRONG_INLINE Packet4f vec4f_swizzle1(const Packet4f &a, int p, int q, int r, int s)
Definition: LSX/PacketMath.h:126
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2i pgather< int32_t, Packet2i >(const int32_t *from, Index stride)
Definition: NEON/PacketMath.h:2958
EIGEN_STRONG_INLINE Packet8s pmin< Packet8s >(const Packet8s &a, const Packet8s &b)
Definition: AltiVec/PacketMath.h:1265
EIGEN_STRONG_INLINE Packet4uc pset1< Packet4uc >(const uint8_t &from)
Definition: NEON/PacketMath.h:689
EIGEN_STRONG_INLINE Packet8s pset1< Packet8s >(const short int &from)
Definition: AltiVec/PacketMath.h:783
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pldexp_generic(const Packet &a, const Packet &exponent)
Definition: GenericPacketMathFunctions.h:226
EIGEN_STRONG_INLINE Packet8c pset1< Packet8c >(const int8_t &from)
Definition: NEON/PacketMath.h:681
EIGEN_STRONG_INLINE Packet8s pmul< Packet8s >(const Packet8s &a, const Packet8s &b)
Definition: AltiVec/PacketMath.h:1170
EIGEN_STRONG_INLINE Packet4c pabs< Packet4c >(const Packet4c &a)
Definition: NEON/PacketMath.h:3362
EIGEN_STRONG_INLINE Packet4f vec4f_unpacklo(const Packet4f &a, const Packet4f &b)
Definition: LSX/PacketMath.h:138
EIGEN_STRONG_INLINE Packet4bf pmax< PropagateNumbers, Packet4bf >(const Packet4bf &a, const Packet4bf &b)
Definition: NEON/PacketMath.h:4879
EIGEN_STRONG_INLINE Packet16uc pcmp_le< Packet16uc >(const Packet16uc &a, const Packet16uc &b)
Definition: LSX/PacketMath.h:1064
EIGEN_STRONG_INLINE Packet2l pand< Packet2l >(const Packet2l &a, const Packet2l &b)
Definition: LSX/PacketMath.h:896
EIGEN_STRONG_INLINE Packet4ui pxor< Packet4ui >(const Packet4ui &a, const Packet4ui &b)
Definition: LSX/PacketMath.h:990
EIGEN_STRONG_INLINE Packet8h pand(const Packet8h &a, const Packet8h &b)
Definition: AVX/PacketMath.h:2319
EIGEN_STRONG_INLINE int8_t predux< Packet4c >(const Packet4c &a)
Definition: NEON/PacketMath.h:3478
EIGEN_STRONG_INLINE unsigned char pfirst< Packet16uc >(const Packet16uc &a)
Definition: AltiVec/PacketMath.h:1898
EIGEN_STRONG_INLINE Packet2i pcmp_eq< Packet2i >(const Packet2i &a, const Packet2i &b)
Definition: NEON/PacketMath.h:1854
EIGEN_STRONG_INLINE Packet8s pcmp_lt< Packet8s >(const Packet8s &a, const Packet8s &b)
Definition: LSX/PacketMath.h:1093
EIGEN_STRONG_INLINE Packet2f paddsub< Packet2f >(const Packet2f &a, const Packet2f &b)
Definition: NEON/PacketMath.h:992
EIGEN_STRONG_INLINE void pstore< int32_t >(int32_t *to, const Packet4i &from)
Definition: LSX/PacketMath.h:1549
EIGEN_STRONG_INLINE Packet16c pand< Packet16c >(const Packet16c &a, const Packet16c &b)
Definition: LSX/PacketMath.h:884
EIGEN_STRONG_INLINE Packet pdiv_float_common(const Packet &a, const Packet &b)
Definition: NEON/PacketMath.h:4696
EIGEN_STRONG_INLINE Packet4bf pabsdiff< Packet4bf >(const Packet4bf &a, const Packet4bf &b)
Definition: NEON/PacketMath.h:5012
EIGEN_STRONG_INLINE Packet2d ptrunc< Packet2d >(const Packet2d &a)
Definition: LSX/PacketMath.h:2749
EIGEN_STRONG_INLINE Packet4s pload< Packet4s >(const int16_t *from)
Definition: NEON/PacketMath.h:2422
EIGEN_STRONG_INLINE Packet16c pandnot< Packet16c >(const Packet16c &a, const Packet16c &b)
Definition: LSX/PacketMath.h:1007
EIGEN_STRONG_INLINE Packet16uc pmin< Packet16uc >(const Packet16uc &a, const Packet16uc &b)
Definition: AltiVec/PacketMath.h:1277
EIGEN_STRONG_INLINE Packet8h pxor(const Packet8h &a, const Packet8h &b)
Definition: AVX/PacketMath.h:2315
EIGEN_STRONG_INLINE Packet4bf pfloor< Packet4bf >(const Packet4bf &a)
Definition: NEON/PacketMath.h:4928
EIGEN_STRONG_INLINE int pfirst< Packet4i >(const Packet4i &a)
Definition: AltiVec/PacketMath.h:1869
EIGEN_STRONG_INLINE Packet4i plset< Packet4i >(const int &a)
Definition: AltiVec/PacketMath.h:1045
EIGEN_STRONG_INLINE Packet2d vec2d_unpacklo(const Packet2d &a, const Packet2d &b)
Definition: LSX/PacketMath.h:160
EIGEN_STRONG_INLINE Packet2ui pcmp_le< Packet2ui >(const Packet2ui &a, const Packet2ui &b)
Definition: NEON/PacketMath.h:1692
static EIGEN_STRONG_INLINE int eigen_neon_shuffle_mask(int p, int q, int r, int s)
Definition: NEON/PacketMath.h:128
EIGEN_STRONG_INLINE float predux< Packet4f >(const Packet4f &a)
Definition: AltiVec/PacketMath.h:2435
EIGEN_STRONG_INLINE bfloat16 predux_mul< Packet4bf >(const Packet4bf &a)
Definition: NEON/PacketMath.h:4998
EIGEN_STRONG_INLINE Packet8uc pdiv< Packet8uc >(const Packet8uc &, const Packet8uc &)
Definition: NEON/PacketMath.h:1220
EIGEN_STRONG_INLINE Packet4bf psub< Packet4bf >(const Packet4bf &a, const Packet4bf &b)
Definition: NEON/PacketMath.h:4958
EIGEN_STRONG_INLINE float predux_mul< Packet2f >(const Packet2f &a)
Definition: NEON/PacketMath.h:3662
EIGEN_STRONG_INLINE Packet2ul pcmp_lt< Packet2ul >(const Packet2ul &a, const Packet2ul &b)
Definition: LSX/PacketMath.h:1117
EIGEN_STRONG_INLINE Packet2d pceil< Packet2d >(const Packet2d &a)
Definition: MSA/PacketMath.h:1186
EIGEN_STRONG_INLINE Packet2l pload< Packet2l >(const int64_t *from)
Definition: LSX/PacketMath.h:1423
EIGEN_STRONG_INLINE void pstore< int16_t >(int16_t *to, const Packet8s &from)
Definition: LSX/PacketMath.h:1545
EIGEN_STRONG_INLINE Packet2f padd< Packet2f >(const Packet2f &a, const Packet2f &b)
Definition: NEON/PacketMath.h:840
EIGEN_STRONG_INLINE Packet4s pmul< Packet4s >(const Packet4s &a, const Packet4s &b)
Definition: NEON/PacketMath.h:1157
EIGEN_STRONG_INLINE Packet4f ploadu< Packet4f >(const float *from)
Definition: AltiVec/PacketMath.h:1533
EIGEN_STRONG_INLINE Packet4bf ptrunc< Packet4bf >(const Packet4bf &a)
Definition: NEON/PacketMath.h:4943
EIGEN_STRONG_INLINE Packet16uc pcmp_eq< Packet16uc >(const Packet16uc &a, const Packet16uc &b)
Definition: LSX/PacketMath.h:1155
EIGEN_STRONG_INLINE Packet2f por< Packet2f >(const Packet2f &a, const Packet2f &b)
Definition: NEON/PacketMath.h:1972
EIGEN_STRONG_INLINE void pstoreu< uint8_t >(uint8_t *to, const Packet16uc &from)
Definition: LSX/PacketMath.h:1599
EIGEN_STRONG_INLINE Packet4i pmul< Packet4i >(const Packet4i &a, const Packet4i &b)
Definition: AltiVec/PacketMath.h:1166
EIGEN_STRONG_INLINE Packet2d print< Packet2d >(const Packet2d &a)
Definition: LSX/PacketMath.h:2745
EIGEN_STRONG_INLINE int32_t predux_max< Packet2i >(const Packet2i &a)
Definition: NEON/PacketMath.h:4071
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4ui pgather< uint32_t, Packet4ui >(const uint32_t *from, Index stride)
Definition: LSX/PacketMath.h:1710
EIGEN_STRONG_INLINE Packet4bf pmin< PropagateNumbers, Packet4bf >(const Packet4bf &a, const Packet4bf &b)
Definition: NEON/PacketMath.h:4865
EIGEN_STRONG_INLINE Packet4bf pcmp_lt_or_nan< Packet4bf >(const Packet4bf &a, const Packet4bf &b)
Definition: NEON/PacketMath.h:5027
EIGEN_STRONG_INLINE Packet4i ploadquad< Packet4i >(const int32_t *from)
Definition: LSX/PacketMath.h:2601
EIGEN_STRONG_INLINE void pstoreu< int8_t >(int8_t *to, const Packet16c &from)
Definition: LSX/PacketMath.h:1583
EIGEN_STRONG_INLINE Packet4i pand< Packet4i >(const Packet4i &a, const Packet4i &b)
Definition: AltiVec/PacketMath.h:1410
EIGEN_STRONG_INLINE Packet2ui pset1< Packet2ui >(const uint32_t &from)
Definition: NEON/PacketMath.h:725
EIGEN_STRONG_INLINE Packet4bf pload< Packet4bf >(const bfloat16 *from)
Definition: NEON/PacketMath.h:4835
EIGEN_STRONG_INLINE Packet2d pmin< Packet2d >(const Packet2d &a, const Packet2d &b)
Definition: LSX/PacketMath.h:1244
EIGEN_STRONG_INLINE Packet16uc ploaddup< Packet16uc >(const unsigned char *from)
Definition: AltiVec/PacketMath.h:1704
EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f &mask, const Packet4f &a, const Packet4f &b)
Definition: AltiVec/PacketMath.h:1474
EIGEN_STRONG_INLINE int predux< Packet4i >(const Packet4i &a)
Definition: AltiVec/PacketMath.h:2445
EIGEN_STRONG_INLINE Packet8us pcmp_eq< Packet8us >(const Packet8us &a, const Packet8us &b)
Definition: LSX/PacketMath.h:1159
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pgather< uint64_t, Packet2ul >(const uint64_t *from, Index stride)
Definition: LSX/PacketMath.h:1719
EIGEN_STRONG_INLINE Packet2f pdiv< Packet2f >(const Packet2f &a, const Packet2f &b)
Definition: NEON/PacketMath.h:4717
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pfrexp_generic(const Packet &a, Packet &exponent)
Definition: GenericPacketMathFunctions.h:184
EIGEN_STRONG_INLINE Packet4bf pceil< Packet4bf >(const Packet4bf &a)
Definition: NEON/PacketMath.h:4933
EIGEN_ALWAYS_INLINE Packet2d make_packet2d(double a, double b)
Definition: LSX/PacketMath.h:145
EIGEN_STRONG_INLINE Packet4f pand< Packet4f >(const Packet4f &a, const Packet4f &b)
Definition: AltiVec/PacketMath.h:1406
EIGEN_STRONG_INLINE Packet16c ploadquad< Packet16c >(const signed char *from)
Definition: AltiVec/PacketMath.h:1714
EIGEN_STRONG_INLINE int8_t predux_mul< Packet8c >(const Packet8c &a)
Definition: NEON/PacketMath.h:3676
EIGEN_STRONG_INLINE float predux_max< Packet2f >(const Packet2f &a)
Definition: NEON/PacketMath.h:3940
EIGEN_STRONG_INLINE int predux_max< Packet4i >(const Packet4i &a)
Definition: AltiVec/PacketMath.h:2684
EIGEN_STRONG_INLINE Packet psqrt_float_common(const Packet &a)
Definition: NEON/PacketMath.h:4672
EIGEN_STRONG_INLINE uint8_t predux< Packet8uc >(const Packet8uc &a)
Definition: NEON/PacketMath.h:3568
EIGEN_STRONG_INLINE Packet2l pmin< Packet2l >(const Packet2l &a, const Packet2l &b)
Definition: LSX/PacketMath.h:1184
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter< uint8_t, Packet16uc >(uint8_t *to, const Packet16uc &from, Index stride)
Definition: LSX/PacketMath.h:1785
EIGEN_STRONG_INLINE Packet8us pset1< Packet8us >(const unsigned short int &from)
Definition: AltiVec/PacketMath.h:788
EIGEN_STRONG_INLINE bfloat16 predux< Packet4bf >(const Packet4bf &a)
Definition: NEON/PacketMath.h:4983
EIGEN_STRONG_INLINE Packet2ui ploaddup< Packet2ui >(const uint32_t *from)
Definition: NEON/PacketMath.h:2610
EIGEN_STRONG_INLINE Packet8s pandnot< Packet8s >(const Packet8s &a, const Packet8s &b)
Definition: LSX/PacketMath.h:1011
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2d pgather< double, Packet2d >(const double *from, Index stride)
Definition: LSX/PacketMath.h:1621
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s pgather< int16_t, Packet4s >(const int16_t *from, Index stride)
Definition: NEON/PacketMath.h:2918
EIGEN_STRONG_INLINE Packet2f ploaddup< Packet2f >(const float *from)
Definition: NEON/PacketMath.h:2540
EIGEN_STRONG_INLINE Packet4s pabsdiff< Packet4s >(const Packet4s &a, const Packet4s &b)
Definition: NEON/PacketMath.h:1399
EIGEN_STRONG_INLINE Packet16c ploaddup< Packet16c >(const signed char *from)
Definition: AltiVec/PacketMath.h:1694
EIGEN_STRONG_INLINE int8_t predux< Packet8c >(const Packet8c &a)
Definition: NEON/PacketMath.h:3495
EIGEN_STRONG_INLINE Packet8c ploaddup< Packet8c >(const int8_t *from)
Definition: NEON/PacketMath.h:2553
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4i pgather< int32_t, Packet4i >(const int32_t *from, Index stride)
Definition: LSX/PacketMath.h:1660
EIGEN_STRONG_INLINE Packet4bf pdiv< Packet4bf >(const Packet4bf &a, const Packet4bf &b)
Definition: NEON/PacketMath.h:4968
EIGEN_STRONG_INLINE Packet4i pcmp_lt< Packet4i >(const Packet4i &a, const Packet4i &b)
Definition: LSX/PacketMath.h:1097
EIGEN_STRONG_INLINE int64_t predux_max< Packet2l >(const Packet2l &a)
Definition: LSX/PacketMath.h:2149
EIGEN_STRONG_INLINE Packet4c pxor< Packet4c >(const Packet4c &a, const Packet4c &b)
Definition: NEON/PacketMath.h:2053
EIGEN_STRONG_INLINE int8_t pfirst< Packet8c >(const Packet8c &a)
Definition: NEON/PacketMath.h:3213
EIGEN_STRONG_INLINE unsigned short int pfirst< Packet8us >(const Packet8us &a)
Definition: AltiVec/PacketMath.h:1888
EIGEN_STRONG_INLINE Packet4i pmax< Packet4i >(const Packet4i &a, const Packet4i &b)
Definition: AltiVec/PacketMath.h:1293
EIGEN_STRONG_INLINE int16_t pfirst< Packet4s >(const Packet4s &a)
Definition: NEON/PacketMath.h:3233
EIGEN_STRONG_INLINE Packet8c pandnot< Packet8c >(const Packet8c &a, const Packet8c &b)
Definition: NEON/PacketMath.h:2130
EIGEN_STRONG_INLINE Packet16uc pxor< Packet16uc >(const Packet16uc &a, const Packet16uc &b)
Definition: LSX/PacketMath.h:982
EIGEN_STRONG_INLINE signed char predux< Packet16c >(const Packet16c &a)
Definition: AltiVec/PacketMath.h:2510
EIGEN_STRONG_INLINE Packet4f pabsdiff< Packet4f >(const Packet4f &a, const Packet4f &b)
Definition: LSX/PacketMath.h:2690
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c pgather< int8_t, Packet8c >(const int8_t *from, Index stride)
Definition: NEON/PacketMath.h:2848
EIGEN_STRONG_INLINE unsigned short int predux_mul< Packet8us >(const Packet8us &a)
Definition: AltiVec/PacketMath.h:2547
EIGEN_STRONG_INLINE Packet16c padd< Packet16c >(const Packet16c &a, const Packet16c &b)
Definition: AltiVec/PacketMath.h:1086
EIGEN_STRONG_INLINE Packet4c padd< Packet4c >(const Packet4c &a, const Packet4c &b)
Definition: NEON/PacketMath.h:848
EIGEN_STRONG_INLINE int8_t predux_max< Packet4c >(const Packet4c &a)
Definition: NEON/PacketMath.h:3950
EIGEN_STRONG_INLINE unsigned char predux_min< Packet16uc >(const Packet16uc &a)
Definition: AltiVec/PacketMath.h:2659
EIGEN_STRONG_INLINE short int predux< Packet8s >(const Packet8s &a)
Definition: AltiVec/PacketMath.h:2478
EIGEN_STRONG_INLINE uint8_t predux_mul< Packet8uc >(const Packet8uc &a)
Definition: NEON/PacketMath.h:3692
EIGEN_STRONG_INLINE uint8_t predux< Packet4uc >(const Packet4uc &a)
Definition: NEON/PacketMath.h:3511
EIGEN_STRONG_INLINE Packet2l pmul< Packet2l >(const Packet2l &a, const Packet2l &b)
Definition: LSX/PacketMath.h:757
EIGEN_STRONG_INLINE int16_t predux_max< Packet4s >(const Packet4s &a)
Definition: NEON/PacketMath.h:4047
EIGEN_STRONG_INLINE Packet2i pxor< Packet2i >(const Packet2i &a, const Packet2i &b)
Definition: NEON/PacketMath.h:2093
EIGEN_STRONG_INLINE Packet2d pset1frombits< Packet2d >(uint64_t from)
Definition: LSX/PacketMath.h:513
EIGEN_STRONG_INLINE Packet4i pload< Packet4i >(const int *from)
Definition: AltiVec/PacketMath.h:497
EIGEN_STRONG_INLINE int32_t predux< Packet2i >(const Packet2i &a)
Definition: NEON/PacketMath.h:3607
__vector float Packet4f
Definition: AltiVec/PacketMath.h:33
EIGEN_STRONG_INLINE Packet2d psub< Packet2d >(const Packet2d &a, const Packet2d &b)
Definition: LSX/PacketMath.h:646
EIGEN_STRONG_INLINE Packet4s psub< Packet4s >(const Packet4s &a, const Packet4s &b)
Definition: NEON/PacketMath.h:949
EIGEN_STRONG_INLINE Packet4f psub< Packet4f >(const Packet4f &a, const Packet4f &b)
Definition: AltiVec/PacketMath.h:1095
EIGEN_STRONG_INLINE Packet4c pmul< Packet4c >(const Packet4c &a, const Packet4c &b)
Definition: NEON/PacketMath.h:1131
Packet prsqrt_float_common(const Packet &a)
Definition: NEON/PacketMath.h:4610
EIGEN_STRONG_INLINE Packet8s psub< Packet8s >(const Packet8s &a, const Packet8s &b)
Definition: AltiVec/PacketMath.h:1103
EIGEN_STRONG_INLINE Packet2f pxor< Packet2f >(const Packet2f &a, const Packet2f &b)
Definition: NEON/PacketMath.h:2045
EIGEN_STRONG_INLINE Packet4f prsqrt(const Packet4f &a)
Definition: LSX/PacketMath.h:2528
EIGEN_STRONG_INLINE Packet4uc padd< Packet4uc >(const Packet4uc &a, const Packet4uc &b)
Definition: NEON/PacketMath.h:861
int32x2_t Packet2i
Definition: NEON/PacketMath.h:87
EIGEN_STRONG_INLINE Packet4f plset< Packet4f >(const float &a)
Definition: AltiVec/PacketMath.h:1041
EIGEN_STRONG_INLINE uint32_t predux_mul< Packet4ui >(const Packet4ui &a)
Definition: LSX/PacketMath.h:2058
EIGEN_STRONG_INLINE Packet2ul psub< Packet2ul >(const Packet2ul &a, const Packet2ul &b)
Definition: LSX/PacketMath.h:638
EIGEN_STRONG_INLINE Packet8uc pcmp_eq< Packet8uc >(const Packet8uc &a, const Packet8uc &b)
Definition: NEON/PacketMath.h:1830
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter< uint16_t, Packet8us >(uint16_t *to, const Packet8us &from, Index stride)
Definition: LSX/PacketMath.h:1805
EIGEN_STRONG_INLINE Packet8s ploadquad< Packet8s >(const short int *from)
Definition: AltiVec/PacketMath.h:1669
EIGEN_STRONG_INLINE uint64_t pfirst< Packet2ul >(const Packet2ul &a)
Definition: LSX/PacketMath.h:1914
EIGEN_STRONG_INLINE Packet4s pdiv< Packet4s >(const Packet4s &, const Packet4s &)
Definition: NEON/PacketMath.h:1230
EIGEN_STRONG_INLINE Packet8bf F32ToBf16(Packet4f p4f)
Definition: AltiVec/PacketMath.h:2059
EIGEN_STRONG_INLINE Packet4uc ploadu< Packet4uc >(const uint8_t *from)
Definition: NEON/PacketMath.h:2485
EIGEN_STRONG_INLINE void pstoreu< int64_t >(int64_t *to, const Packet8l &from)
Definition: AVX512/PacketMath.h:1123
EIGEN_STRONG_INLINE void pstoreu< float >(float *to, const Packet4f &from)
Definition: AltiVec/PacketMath.h:1756
EIGEN_STRONG_INLINE Packet2d pmax< Packet2d >(const Packet2d &a, const Packet2d &b)
Definition: LSX/PacketMath.h:1256
EIGEN_ALWAYS_INLINE Packet4f make_packet4f(float a, float b, float c, float d)
Definition: LSX/PacketMath.h:92
EIGEN_STRONG_INLINE Packet4f pround< Packet4f >(const Packet4f &a)
Definition: AltiVec/PacketMath.h:1479
EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f &a, const Packet4f &b)
Definition: AltiVec/PacketMath.h:1329
EIGEN_STRONG_INLINE void prefetch< int16_t >(const int16_t *addr)
Definition: LSX/PacketMath.h:1844
EIGEN_STRONG_INLINE Packet4bf pgather< bfloat16, Packet4bf >(const bfloat16 *from, Index stride)
Definition: NEON/PacketMath.h:4973
EIGEN_STRONG_INLINE Packet16c pset1< Packet16c >(const signed char &from)
Definition: AltiVec/PacketMath.h:793
EIGEN_STRONG_INLINE void pstore< uint64_t >(uint64_t *to, const Packet2ul &from)
Definition: LSX/PacketMath.h:1569
EIGEN_STRONG_INLINE uint8_t predux_max< Packet8uc >(const Packet8uc &a)
Definition: NEON/PacketMath.h:4032
EIGEN_STRONG_INLINE Packet2l plset< Packet2l >(const int64_t &a)
Definition: LSX/PacketMath.h:533
EIGEN_STRONG_INLINE Packet2i pandnot< Packet2i >(const Packet2i &a, const Packet2i &b)
Definition: NEON/PacketMath.h:2166
EIGEN_STRONG_INLINE Packet4ui ploadu< Packet4ui >(const uint32_t *from)
Definition: LSX/PacketMath.h:1476
EIGEN_STRONG_INLINE Packet2d pround< Packet2d >(const Packet2d &a)
Definition: MSA/PacketMath.h:1206
EIGEN_STRONG_INLINE int64_t pfirst< Packet2l >(const Packet2l &a)
Definition: LSX/PacketMath.h:1898
EIGEN_STRONG_INLINE Packet4us pandnot< Packet4us >(const Packet4us &a, const Packet4us &b)
Definition: NEON/PacketMath.h:2158
EIGEN_STRONG_INLINE void pstore< uint32_t >(uint32_t *to, const Packet8ui &from)
Definition: AVX/PacketMath.h:1619
EIGEN_STRONG_INLINE Packet4bf pcmp_eq< Packet4bf >(const Packet4bf &a, const Packet4bf &b)
Definition: NEON/PacketMath.h:5017
EIGEN_STRONG_INLINE Packet4us psub< Packet4us >(const Packet4us &a, const Packet4us &b)
Definition: NEON/PacketMath.h:957
EIGEN_STRONG_INLINE Packet16c pxor< Packet16c >(const Packet16c &a, const Packet16c &b)
Definition: LSX/PacketMath.h:966
EIGEN_STRONG_INLINE Packet4ui ploadquad< Packet4ui >(const uint32_t *from)
Definition: LSX/PacketMath.h:2606
EIGEN_STRONG_INLINE Packet2ul pdiv< Packet2ul >(const Packet2ul &a, const Packet2ul &b)
Definition: LSX/PacketMath.h:806
EIGEN_STRONG_INLINE Packet2i por< Packet2i >(const Packet2i &a, const Packet2i &b)
Definition: NEON/PacketMath.h:2020
EIGEN_STRONG_INLINE void pstoreu< uint64_t >(uint64_t *to, const Packet2ul &from)
Definition: LSX/PacketMath.h:1611
EIGEN_STRONG_INLINE double pfirst< Packet2d >(const Packet2d &a)
Definition: LSX/PacketMath.h:1879
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2f pgather< float, Packet2f >(const float *from, Index stride)
Definition: NEON/PacketMath.h:2828
EIGEN_STRONG_INLINE uint16_t predux_mul< Packet4us >(const Packet4us &a)
Definition: NEON/PacketMath.h:3718
EIGEN_STRONG_INLINE Packet16uc pabsdiff< Packet16uc >(const Packet16uc &a, const Packet16uc &b)
Definition: LSX/PacketMath.h:2793
EIGEN_STRONG_INLINE Packet2f pabsdiff< Packet2f >(const Packet2f &a, const Packet2f &b)
Definition: NEON/PacketMath.h:1365
EIGEN_STRONG_INLINE unsigned char predux_mul< Packet16uc >(const Packet16uc &a)
Definition: AltiVec/PacketMath.h:2578
EIGEN_STRONG_INLINE void pstoreu< int16_t >(int16_t *to, const Packet8s &from)
Definition: LSX/PacketMath.h:1587
EIGEN_STRONG_INLINE Packet4ui pcmp_eq< Packet4ui >(const Packet4ui &a, const Packet4ui &b)
Definition: LSX/PacketMath.h:1163
eigen_packet_wrapper< __m128i, 7 > Packet2ul
Definition: LSX/PacketMath.h:45
EIGEN_STRONG_INLINE Packet2l pxor< Packet2l >(const Packet2l &a, const Packet2l &b)
Definition: LSX/PacketMath.h:978
int16x4_t Packet4s
Definition: NEON/PacketMath.h:83
EIGEN_STRONG_INLINE Packet4s pcmp_le< Packet4s >(const Packet4s &a, const Packet4s &b)
Definition: NEON/PacketMath.h:1668
EIGEN_STRONG_INLINE Packet4uc ploaddup< Packet4uc >(const uint8_t *from)
Definition: NEON/PacketMath.h:2564
EIGEN_STRONG_INLINE Packet2ui plset< Packet2ui >(const uint32_t &a)
Definition: NEON/PacketMath.h:819
EIGEN_STRONG_INLINE Packet16c pmul< Packet16c >(const Packet16c &a, const Packet16c &b)
Definition: AltiVec/PacketMath.h:1178
EIGEN_STRONG_INLINE Packet4bf ploaddup< Packet4bf >(const bfloat16 *from)
Definition: NEON/PacketMath.h:4855
EIGEN_STRONG_INLINE Packet8s pxor< Packet8s >(const Packet8s &a, const Packet8s &b)
Definition: LSX/PacketMath.h:970
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4uc pgather< uint8_t, Packet4uc >(const uint8_t *from, Index stride)
Definition: NEON/PacketMath.h:2880
EIGEN_STRONG_INLINE Packet4ui ploaddup< Packet4ui >(const uint32_t *from)
Definition: LSX/PacketMath.h:1523
EIGEN_STRONG_INLINE Packet2f pcmp_le< Packet2f >(const Packet2f &a, const Packet2f &b)
Definition: NEON/PacketMath.h:1634
EIGEN_STRONG_INLINE void pstore< int64_t >(int64_t *to, const Packet8l &from)
Definition: AVX512/PacketMath.h:1106
EIGEN_STRONG_INLINE Packet2f pload< Packet2f >(const float *from)
Definition: NEON/PacketMath.h:2386
EIGEN_STRONG_INLINE Packet8us ploadquad< Packet8us >(const unsigned short int *from)
Definition: AltiVec/PacketMath.h:1679
EIGEN_STRONG_INLINE Packet4f print< Packet4f >(const Packet4f &a)
Definition: LSX/PacketMath.h:2711
EIGEN_STRONG_INLINE Packet2d pfloor< Packet2d >(const Packet2d &a)
Definition: MSA/PacketMath.h:1167
EIGEN_STRONG_INLINE Packet4f preciprocal< Packet4f >(const Packet4f &a)
Definition: LSX/PacketMath.h:2719
EIGEN_STRONG_INLINE Packet2f psub< Packet2f >(const Packet2f &a, const Packet2f &b)
Definition: NEON/PacketMath.h:915
EIGEN_STRONG_INLINE float predux_min< Packet4f >(const Packet4f &a)
Definition: AltiVec/PacketMath.h:2599
EIGEN_STRONG_INLINE void pstore< uint16_t >(uint16_t *to, const Packet8us &from)
Definition: LSX/PacketMath.h:1561
EIGEN_STRONG_INLINE void prefetch< double >(const double *addr)
Definition: AVX/PacketMath.h:1750
EIGEN_STRONG_INLINE uint32_t predux_max< Packet2ui >(const Packet2ui &a)
Definition: NEON/PacketMath.h:4080
EIGEN_STRONG_INLINE Packet4f pmax< Packet4f >(const Packet4f &a, const Packet4f &b)
Definition: AltiVec/PacketMath.h:1282
EIGEN_STRONG_INLINE Packet2ul pand< Packet2ul >(const Packet2ul &a, const Packet2ul &b)
Definition: LSX/PacketMath.h:912
EIGEN_STRONG_INLINE Packet4c pmax< Packet4c >(const Packet4c &a, const Packet4c &b)
Definition: NEON/PacketMath.h:1565
EIGEN_STRONG_INLINE Packet8uc pabsdiff< Packet8uc >(const Packet8uc &a, const Packet8uc &b)
Definition: NEON/PacketMath.h:1391
EIGEN_STRONG_INLINE Packet4c ploadu< Packet4c >(const int8_t *from)
Definition: NEON/PacketMath.h:2471
EIGEN_STRONG_INLINE Packet2i plset< Packet2i >(const int32_t &a)
Definition: NEON/PacketMath.h:809
std::int32_t int32_t
Definition: Meta.h:41
std::int8_t int8_t
Definition: Meta.h:37
std::uint8_t uint8_t
Definition: Meta.h:36
std::int16_t int16_t
Definition: Meta.h:39
std::int64_t int64_t
Definition: Meta.h:43
EIGEN_DEVICE_FUNC const Scalar & q
Definition: SpecialFunctionsImpl.h:2019
std::uint16_t uint16_t
Definition: Meta.h:38
std::uint32_t uint32_t
Definition: Meta.h:40
std::uint64_t uint64_t
Definition: Meta.h:42
Namespace containing all symbols from the Eigen library.
Definition: bench_norm.cpp:70
auto run(Kernel kernel, Args &&... args) -> decltype(kernel(args...))
Definition: gpu_test_helper.h:414
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition: Meta.h:83
CleanedUpDerType< DerType >::type() min(const AutoDiffScalar< DerType > &x, const T &y)
Definition: AutoDiffScalar.h:494
CleanedUpDerType< DerType >::type() max(const AutoDiffScalar< DerType > &x, const T &y)
Definition: AutoDiffScalar.h:499
const Product< Lhs, Rhs > prod(const Lhs &lhs, const Rhs &rhs)
Definition: evaluators.cpp:7
r
Definition: UniformPSDSelfTest.py:20
int c
Definition: calibrate.py:100
val
Definition: calibrate.py:119
Definition: Eigen_Colamd.h:49
list x
Definition: plotDoE.py:28
Holds information about the various numeric (i.e. scalar) types allowed by Eigen.
Definition: NumTraits.h:217
unsigned short value
Definition: BFloat16.h:77
Definition: BFloat16.h:101
numext::uint16_t x
Definition: Half.h:101
Definition: Half.h:139
Definition: GenericPacketMath.h:1407
Packet packet[N]
Definition: GenericPacketMath.h:1408
@ HasASin
Definition: GenericPacketMath.h:84
@ HasATanh
Definition: GenericPacketMath.h:87
@ HasRsqrt
Definition: GenericPacketMath.h:74
@ HasSin
Definition: GenericPacketMath.h:81
@ HasBlend
Definition: GenericPacketMath.h:66
@ HasErfc
Definition: GenericPacketMath.h:96
@ HasACos
Definition: GenericPacketMath.h:85
@ HasAbsDiff
Definition: GenericPacketMath.h:65
@ HasArg
Definition: GenericPacketMath.h:64
@ HasNdtri
Definition: GenericPacketMath.h:97
@ HasCos
Definition: GenericPacketMath.h:82
@ HasCmp
Definition: GenericPacketMath.h:69
@ HasShift
Definition: GenericPacketMath.h:50
@ HasExp
Definition: GenericPacketMath.h:75
@ HasSqrt
Definition: GenericPacketMath.h:73
@ HasErf
Definition: GenericPacketMath.h:95
@ HasBessel
Definition: GenericPacketMath.h:98
@ HasLog
Definition: GenericPacketMath.h:77
@ HasTanh
Definition: GenericPacketMath.h:90
@ HasATan
Definition: GenericPacketMath.h:86
@ HasDiv
Definition: GenericPacketMath.h:71
Definition: GenericPacketMath.h:225
Definition: Meta.h:145
@ value
Definition: Meta.h:146
Packet4bf half
Definition: NEON/PacketMath.h:4736
Packet4bf type
Definition: NEON/PacketMath.h:4735
Packet2f half
Definition: NEON/PacketMath.h:178
Packet4f type
Definition: NEON/PacketMath.h:177
@ HasTanh
Definition: AltiVec/PacketMath.h:200
Packet8s type
Definition: NEON/PacketMath.h:276
Packet4s half
Definition: NEON/PacketMath.h:277
Packet2i half
Definition: NEON/PacketMath.h:332
Packet4i type
Definition: NEON/PacketMath.h:331
Packet2l half
Definition: NEON/PacketMath.h:388
Packet2l type
Definition: NEON/PacketMath.h:387
Packet8c half
Definition: NEON/PacketMath.h:221
Packet16c type
Definition: NEON/PacketMath.h:220
Packet4us half
Definition: NEON/PacketMath.h:304
Packet8us type
Definition: NEON/PacketMath.h:303
Packet2ui half
Definition: NEON/PacketMath.h:359
Packet4ui type
Definition: NEON/PacketMath.h:358
Packet2ul type
Definition: NEON/PacketMath.h:414
Packet2ul half
Definition: NEON/PacketMath.h:415
Packet16uc type
Definition: NEON/PacketMath.h:247
Packet8uc half
Definition: NEON/PacketMath.h:248
T type
Definition: GenericPacketMath.h:109
@ size
Definition: GenericPacketMath.h:113
@ AlignedOnScalar
Definition: GenericPacketMath.h:114
@ Vectorizable
Definition: GenericPacketMath.h:112
T half
Definition: GenericPacketMath.h:110
@ HasSub
Definition: GenericPacketMath.h:118
@ HasMax
Definition: GenericPacketMath.h:124
@ HasNegate
Definition: GenericPacketMath.h:120
@ HasMul
Definition: GenericPacketMath.h:119
@ HasAdd
Definition: GenericPacketMath.h:117
@ HasSetLinear
Definition: GenericPacketMath.h:126
@ HasMin
Definition: GenericPacketMath.h:123
@ HasConj
Definition: GenericPacketMath.h:125
@ HasAbs2
Definition: GenericPacketMath.h:122
@ HasAbs
Definition: GenericPacketMath.h:121
int8_t type
Definition: NEON/PacketMath.h:491
Packet8c half
Definition: NEON/PacketMath.h:492
uint8_t type
Definition: NEON/PacketMath.h:527
Packet8uc half
Definition: NEON/PacketMath.h:528
Packet2f half
Definition: NEON/PacketMath.h:442
Packet2i integer_packet
Definition: NEON/PacketMath.h:443
float type
Definition: NEON/PacketMath.h:441
int32_t type
Definition: NEON/PacketMath.h:587
Packet2i half
Definition: NEON/PacketMath.h:588
Packet2l half
Definition: NEON/PacketMath.h:636
int64_t type
Definition: NEON/PacketMath.h:635
uint32_t type
Definition: NEON/PacketMath.h:611
Packet2ui half
Definition: NEON/PacketMath.h:612
Packet2ul half
Definition: NEON/PacketMath.h:648
uint64_t type
Definition: NEON/PacketMath.h:647
Packet4bf half
Definition: NEON/PacketMath.h:4773
bfloat16 type
Definition: NEON/PacketMath.h:4772
Packet4c half
Definition: NEON/PacketMath.h:468
int8_t type
Definition: NEON/PacketMath.h:467
Packet4i integer_packet
Definition: NEON/PacketMath.h:456
Packet2f half
Definition: NEON/PacketMath.h:455
float type
Definition: NEON/PacketMath.h:454
int32_t type
Definition: NEON/PacketMath.h:599
Packet2i half
Definition: NEON/PacketMath.h:600
int16_t type
Definition: NEON/PacketMath.h:539
Packet4s half
Definition: NEON/PacketMath.h:540
Packet4uc half
Definition: NEON/PacketMath.h:504
uint8_t type
Definition: NEON/PacketMath.h:503
uint32_t type
Definition: NEON/PacketMath.h:623
Packet2ui half
Definition: NEON/PacketMath.h:624
uint16_t type
Definition: NEON/PacketMath.h:563
Packet4us half
Definition: NEON/PacketMath.h:564
int8_t type
Definition: NEON/PacketMath.h:479
Packet4c half
Definition: NEON/PacketMath.h:480
int16_t type
Definition: NEON/PacketMath.h:551
Packet4s half
Definition: NEON/PacketMath.h:552
uint8_t type
Definition: NEON/PacketMath.h:515
Packet4uc half
Definition: NEON/PacketMath.h:516
Packet4us half
Definition: NEON/PacketMath.h:576
uint16_t type
Definition: NEON/PacketMath.h:575
Definition: GenericPacketMath.h:134
numext::get_integer_by_size< sizeof(T)>::signed_type integer_packet
Definition: GenericPacketMath.h:137
T type
Definition: GenericPacketMath.h:135
T half
Definition: GenericPacketMath.h:136
@ masked_load_available
Definition: GenericPacketMath.h:142
@ size
Definition: GenericPacketMath.h:139
@ masked_store_available
Definition: GenericPacketMath.h:143
@ vectorizable
Definition: GenericPacketMath.h:141
@ alignment
Definition: GenericPacketMath.h:140
std::ptrdiff_t j
Definition: tut_arithmetic_redux_minmax.cpp:2
Definition: ZVector/PacketMath.h:50