SSE/PacketMath.h
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
5 //
6 // This Source Code Form is subject to the terms of the Mozilla
7 // Public License v. 2.0. If a copy of the MPL was not distributed
8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 
10 #ifndef EIGEN_PACKET_MATH_SSE_H
11 #define EIGEN_PACKET_MATH_SSE_H
12 
13 #include <cstdint>
14 // IWYU pragma: private
15 #include "../../InternalHeaderCheck.h"
16 
17 namespace Eigen {
18 
19 namespace internal {
20 
21 #ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
22 #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
23 #endif
24 
25 #if !defined(EIGEN_VECTORIZE_AVX) && !defined(EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS)
26 // 32 bits => 8 registers
27 // 64 bits => 16 registers
28 #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2 * sizeof(void*))
29 #endif
30 
31 #ifdef EIGEN_VECTORIZE_FMA
32 #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
33 #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
34 #endif
35 #endif
36 
37 #if ((defined EIGEN_VECTORIZE_AVX) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_MINGW || EIGEN_COMP_LCC) && \
38  (__GXX_ABI_VERSION < 1004)) || \
39  EIGEN_OS_QNX
40 // With GCC's default ABI version, __m128 and __m256 are the same type, so we cannot
41 // have overloads for both types without a linker error.
42 // One solution is to increase the ABI version using -fabi-version=4 (or greater).
43 // Otherwise, we work around this inconvenience by wrapping 128-bit types in the following helper
44 // structure:
45 typedef eigen_packet_wrapper<__m128> Packet4f;
46 typedef eigen_packet_wrapper<__m128d> Packet2d;
47 #else
48 typedef __m128 Packet4f;
49 typedef __m128d Packet2d;
50 #endif
51 
52 typedef eigen_packet_wrapper<__m128i, 0> Packet4i;
53 typedef eigen_packet_wrapper<__m128i, 1> Packet2l;
54 typedef eigen_packet_wrapper<__m128i, 2> Packet4ui;
55 typedef eigen_packet_wrapper<__m128i, 3> Packet16b;
56 
57 template <>
58 struct is_arithmetic<__m128> {
59  enum { value = true };
60 };
61 template <>
62 struct is_arithmetic<__m128i> {
63  enum { value = true };
64 };
65 template <>
66 struct is_arithmetic<__m128d> {
67  enum { value = true };
68 };
69 template <>
70 struct is_arithmetic<Packet4i> {
71  enum { value = true };
72 };
73 template <>
74 struct is_arithmetic<Packet2l> {
75  enum { value = true };
76 };
77 // Note that `Packet4ui` uses the underlying type `__m128i`, which is
78 // interpreted as a vector of _signed_ `int32`s, which breaks some arithmetic
79 // operations used in `GenericPacketMath.h`.
80 template <>
81 struct is_arithmetic<Packet4ui> {
82  enum { value = false };
83 };
84 template <>
85 struct is_arithmetic<Packet16b> {
86  enum { value = true };
87 };
88 
89 template <int p, int q, int r, int s>
90 struct shuffle_mask {
91  enum { mask = (s) << 6 | (r) << 4 | (q) << 2 | (p) };
92 };
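// Usage sketch (editorial illustration, not part of the header): the mask packs four
// 2-bit lane indices, lowest lane first, into the immediate expected by SSE shuffles:
//   shuffle_mask<0, 1, 2, 3>::mask  // == 0xE4, the identity shuffle
//   shuffle_mask<3, 2, 1, 0>::mask  // == 0x1B, reverses the four lanes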
93 
94 // TODO: change the implementation of all swizzle* ops from macro to template,
95 #define vec4f_swizzle1(v, p, q, r, s) \
96  Packet4f(_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v), (shuffle_mask<p, q, r, s>::mask))))
97 
98 #define vec4i_swizzle1(v, p, q, r, s) Packet4i(_mm_shuffle_epi32(v, (shuffle_mask<p, q, r, s>::mask)))
99 
100 #define vec4ui_swizzle1(v, p, q, r, s) Packet4ui(vec4i_swizzle1(v, p, q, r, s))
101 
102 #define vec2d_swizzle1(v, p, q) \
103  Packet2d(_mm_castsi128_pd( \
104  _mm_shuffle_epi32(_mm_castpd_si128(v), (shuffle_mask<2 * p, 2 * p + 1, 2 * q, 2 * q + 1>::mask))))
105 
106 #define vec4f_swizzle2(a, b, p, q, r, s) Packet4f(_mm_shuffle_ps((a), (b), (shuffle_mask<p, q, r, s>::mask)))
107 
108 #define vec4i_swizzle2(a, b, p, q, r, s) \
109  Packet4i( \
110  _mm_castps_si128((_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (shuffle_mask<p, q, r, s>::mask)))))
111 
112 #define vec4ui_swizzle2(a, b, p, q, r, s) Packet4ui(vec4i_swizzle2(a, b, p, q, r, s))
113 
114 EIGEN_STRONG_INLINE Packet4f vec4f_movelh(const Packet4f& a, const Packet4f& b) {
115  return Packet4f(_mm_movelh_ps(a, b));
116 }
117 EIGEN_STRONG_INLINE Packet4f vec4f_movehl(const Packet4f& a, const Packet4f& b) {
118  return Packet4f(_mm_movehl_ps(a, b));
119 }
120 EIGEN_STRONG_INLINE Packet4f vec4f_unpacklo(const Packet4f& a, const Packet4f& b) {
121  return Packet4f(_mm_unpacklo_ps(a, b));
122 }
123 EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b) {
124  return Packet4f(_mm_unpackhi_ps(a, b));
125 }
126 #define vec4f_duplane(a, p) vec4f_swizzle2(a, a, p, p, p, p)
127 
128 #define vec2d_swizzle2(a, b, mask) Packet2d(_mm_shuffle_pd(a, b, mask))
129 
130 EIGEN_STRONG_INLINE Packet2d vec2d_unpacklo(const Packet2d& a, const Packet2d& b) {
131  return Packet2d(_mm_unpacklo_pd(a, b));
132 }
133 EIGEN_STRONG_INLINE Packet2d vec2d_unpackhi(const Packet2d& a, const Packet2d& b) {
134  return Packet2d(_mm_unpackhi_pd(a, b));
135 }
136 #define vec2d_duplane(a, p) vec2d_swizzle2(a, a, (p << 1) | p)
137 
138 #define EIGEN_DECLARE_CONST_Packet4f(NAME, X) const Packet4f p4f_##NAME = pset1<Packet4f>(X)
139 
140 #define EIGEN_DECLARE_CONST_Packet2d(NAME, X) const Packet2d p2d_##NAME = pset1<Packet2d>(X)
141 
142 #define EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME, X) const Packet4f p4f_##NAME = pset1frombits<Packet4f>(X)
143 
144 #define EIGEN_DECLARE_CONST_Packet4i(NAME, X) const Packet4i p4i_##NAME = pset1<Packet4i>(X)
145 
146 #define EIGEN_DECLARE_CONST_Packet4ui(NAME, X) const Packet4ui p4ui_##NAME = pset1<Packet4ui>(X)
147 
148 // Work around lack of extract/cvt for epi64 when compiling for 32-bit.
149 #if EIGEN_ARCH_x86_64
150 EIGEN_ALWAYS_INLINE int64_t _mm_extract_epi64_0(const __m128i& a) { return _mm_cvtsi128_si64(a); }
151 #ifdef EIGEN_VECTORIZE_SSE4_1
152 EIGEN_ALWAYS_INLINE int64_t _mm_extract_epi64_1(const __m128i& a) { return _mm_extract_epi64(a, 1); }
153 #else
154 EIGEN_ALWAYS_INLINE int64_t _mm_extract_epi64_1(const __m128i& a) {
155  return _mm_cvtsi128_si64(_mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(a), _mm_castsi128_pd(a), 0x1)));
156 }
157 #endif
158 #else
159 // epi64 instructions are not available. The following seems to generate the same instructions
160 // with -O2 in GCC/Clang.
161 EIGEN_ALWAYS_INLINE int64_t _mm_extract_epi64_0(const __m128i& a) {
162  return numext::bit_cast<int64_t>(_mm_cvtsd_f64(_mm_castsi128_pd(a)));
163 }
164 EIGEN_ALWAYS_INLINE int64_t _mm_extract_epi64_1(const __m128i& a) {
165  return numext::bit_cast<int64_t>(_mm_cvtsd_f64(_mm_shuffle_pd(_mm_castsi128_pd(a), _mm_castsi128_pd(a), 0x1)));
166 }
167 #endif
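// Usage sketch (editorial illustration): on either path, the helpers read one 64-bit
// lane each:
//   __m128i v = _mm_set_epi64x(int64_t(7), int64_t(3));  // lane 1 = 7, lane 0 = 3
//   int64_t lo = _mm_extract_epi64_0(v);                 // 3
//   int64_t hi = _mm_extract_epi64_1(v);                 // 7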
168 
169 // Use the packet_traits defined in AVX/PacketMath.h instead if we're going
170 // to leverage AVX instructions.
171 #ifndef EIGEN_VECTORIZE_AVX
172 template <>
173 struct packet_traits<float> : default_packet_traits {
174  typedef Packet4f type;
175  typedef Packet4f half;
176  enum {
177  Vectorizable = 1,
178  AlignedOnScalar = 1,
179  size = 4,
180 
181  HasCmp = 1,
182  HasDiv = 1,
186  HasACos = 1,
187  HasASin = 1,
188  HasATan = 1,
189  HasATanh = 1,
190  HasLog = 1,
191  HasLog1p = 1,
192  HasExpm1 = 1,
193  HasNdtri = 1,
194  HasExp = 1,
195  HasBessel = 1,
196  HasSqrt = 1,
197  HasRsqrt = 1,
201  HasBlend = 1,
202  HasSign = 0 // The manually vectorized version is slightly slower for SSE.
203  };
204 };
205 template <>
206 struct packet_traits<double> : default_packet_traits {
207  typedef Packet2d type;
208  typedef Packet2d half;
209  enum {
210  Vectorizable = 1,
211  AlignedOnScalar = 1,
212  size = 2,
213 
214  HasCmp = 1,
215  HasDiv = 1,
219  HasLog = 1,
222  HasExp = 1,
223  HasSqrt = 1,
224  HasRsqrt = 1,
225  HasATan = 1,
226  HasATanh = 1,
227  HasBlend = 1
228  };
229 };
230 template <>
231 struct packet_traits<int> : default_packet_traits {
232  typedef Packet4i type;
233  typedef Packet4i half;
234  enum {
235  Vectorizable = 1,
236  AlignedOnScalar = 1,
237  size = 4,
238 
239  HasCmp = 1,
240  HasDiv = 1,
241  HasShift = 1,
242  HasBlend = 1
243  };
244 };
245 template <>
246 struct packet_traits<uint32_t> : default_packet_traits {
247  typedef Packet4ui type;
248  typedef Packet4ui half;
249  enum {
250  Vectorizable = 1,
251  AlignedOnScalar = 1,
252  size = 4,
253 
254  HasDiv = 0,
255  HasNegate = 0,
256  HasCmp = 1,
257  HasShift = 1,
258  HasBlend = 1
259  };
260 };
261 template <>
262 struct packet_traits<int64_t> : default_packet_traits {
263  typedef Packet2l type;
264  typedef Packet2l half;
265  enum {
266  Vectorizable = 1,
267  AlignedOnScalar = 1,
268  size = 2,
269 
270  HasDiv = 0,
271  HasCmp = 1,
272  HasShift = 1,
273  HasBlend = 1
274  };
275 };
276 #endif
277 template <>
278 struct packet_traits<bool> : default_packet_traits {
279  typedef Packet16b type;
280  typedef Packet16b half;
281  enum {
282  Vectorizable = 1,
283  AlignedOnScalar = 1,
284  size = 16,
285 
286  HasCmp = 1, // note -- only pcmp_eq is defined
287  HasShift = 0,
288  HasAbs = 0,
289  HasAbs2 = 0,
290  HasMin = 0,
291  HasMax = 0,
292  HasConj = 0,
293  HasSqrt = 1,
295  HasSign = 0 // Don't try to vectorize psign<bool> = identity.
296  };
297 };
298 
299 template <>
300 struct unpacket_traits<Packet4f> {
301  typedef float type;
302  typedef Packet4f half;
303  typedef Packet4i integer_packet;
304  enum {
305  size = 4,
306  alignment = Aligned16,
307  vectorizable = true,
308  masked_load_available = false,
309  masked_store_available = false
310  };
311 };
312 template <>
313 struct unpacket_traits<Packet2d> {
314  typedef double type;
315  typedef Packet2d half;
316  typedef Packet2l integer_packet;
317  enum {
318  size = 2,
319  alignment = Aligned16,
320  vectorizable = true,
321  masked_load_available = false,
322  masked_store_available = false
323  };
324 };
325 template <>
326 struct unpacket_traits<Packet2l> {
327  typedef int64_t type;
328  typedef Packet2l half;
329  enum {
330  size = 2,
331  alignment = Aligned16,
332  vectorizable = true,
333  masked_load_available = false,
334  masked_store_available = false
335  };
336 };
337 template <>
338 struct unpacket_traits<Packet4i> {
339  typedef int type;
340  typedef Packet4i half;
341  enum {
342  size = 4,
343  alignment = Aligned16,
344  vectorizable = true,
345  masked_load_available = false,
346  masked_store_available = false
347  };
348 };
349 template <>
350 struct unpacket_traits<Packet4ui> {
351  typedef uint32_t type;
352  typedef Packet4ui half;
353  enum {
354  size = 4,
355  alignment = Aligned16,
356  vectorizable = true,
357  masked_load_available = false,
358  masked_store_available = false
359  };
360 };
361 template <>
362 struct unpacket_traits<Packet16b> {
363  typedef bool type;
364  typedef Packet16b half;
365  enum {
366  size = 16,
367  alignment = Aligned16,
368  vectorizable = true,
369  masked_load_available = false,
370  masked_store_available = false
371  };
372 };
373 
374 #ifndef EIGEN_VECTORIZE_AVX
375 template <>
376 struct scalar_div_cost<float, true> {
377  enum { value = 7 };
378 };
379 template <>
380 struct scalar_div_cost<double, true> {
381  enum { value = 8 };
382 };
383 #endif
384 
385 template <>
386 EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
387  return _mm_set_ps1(from);
388 }
389 template <>
390 EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
391  return _mm_set1_pd(from);
392 }
393 template <>
394 EIGEN_STRONG_INLINE Packet2l pset1<Packet2l>(const int64_t& from) {
395  return _mm_set1_epi64x(from);
396 }
397 template <>
398 EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) {
399  return _mm_set1_epi32(from);
400 }
401 template <>
402 EIGEN_STRONG_INLINE Packet4ui pset1<Packet4ui>(const uint32_t& from) {
403  return _mm_set1_epi32(numext::bit_cast<int32_t>(from));
404 }
405 template <>
406 EIGEN_STRONG_INLINE Packet16b pset1<Packet16b>(const bool& from) {
407  return _mm_set1_epi8(static_cast<char>(from));
408 }
409 
410 template <>
411 EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(unsigned int from) {
412  return _mm_castsi128_ps(pset1<Packet4i>(from));
413 }
414 template <>
415 EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(uint64_t from) {
416  return _mm_castsi128_pd(_mm_set1_epi64x(from));
417 }
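// Usage sketch (editorial illustration): pset1frombits broadcasts a raw bit pattern,
// which is convenient for constants such as sign masks:
//   Packet4f ones = pset1frombits<Packet4f>(0x3f800000u);  // four lanes of 1.0f
//   Packet4f mask = pset1frombits<Packet4f>(0x80000000u);  // four lanes of -0.0f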
418 
419 template <>
420 EIGEN_STRONG_INLINE Packet4f peven_mask(const Packet4f& /*a*/) {
421  return _mm_castsi128_ps(_mm_set_epi32(0, -1, 0, -1));
422 }
423 template <>
424 EIGEN_STRONG_INLINE Packet2l peven_mask(const Packet2l& /*a*/) {
425  return _mm_set_epi32(0, 0, -1, -1);
426 }
427 template <>
428 EIGEN_STRONG_INLINE Packet4i peven_mask(const Packet4i& /*a*/) {
429  return _mm_set_epi32(0, -1, 0, -1);
430 }
431 template <>
432 EIGEN_STRONG_INLINE Packet4ui peven_mask(const Packet4ui& /*a*/) {
433  return _mm_set_epi32(0, -1, 0, -1);
434 }
435 template <>
436 EIGEN_STRONG_INLINE Packet2d peven_mask(const Packet2d& /*a*/) {
437  return _mm_castsi128_pd(_mm_set_epi32(0, 0, -1, -1));
438 }
439 
440 template <>
441 EIGEN_STRONG_INLINE Packet4f pzero(const Packet4f& /*a*/) {
442  return _mm_setzero_ps();
443 }
444 template <>
445 EIGEN_STRONG_INLINE Packet2d pzero(const Packet2d& /*a*/) {
446  return _mm_setzero_pd();
447 }
448 template <>
449 EIGEN_STRONG_INLINE Packet2l pzero(const Packet2l& /*a*/) {
450  return _mm_setzero_si128();
451 }
452 template <>
453 EIGEN_STRONG_INLINE Packet4i pzero(const Packet4i& /*a*/) {
454  return _mm_setzero_si128();
455 }
456 template <>
457 EIGEN_STRONG_INLINE Packet4ui pzero(const Packet4ui& /*a*/) {
458  return _mm_setzero_si128();
459 }
460 
461 // GCC generates a shufps instruction for _mm_set1_ps/_mm_load1_ps instead of the more efficient pshufd instruction.
462 // However, using intrinsics for pset1 makes gcc generate crappy code in some cases (see bug 203).
463 // Using inline assembly is also not an option because then gcc fails to reorder the instructions properly.
464 // Therefore, we introduced the pload1 functions to be used in product kernels for which bug 203 does not apply.
465 // Also note that with AVX, we want it to generate a vbroadcastss.
466 #if EIGEN_COMP_GNUC_STRICT && (!defined __AVX__)
467 template <>
468 EIGEN_STRONG_INLINE Packet4f pload1<Packet4f>(const float* from) {
469  return vec4f_swizzle1(_mm_load_ss(from), 0, 0, 0, 0);
470 }
471 #endif
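// Usage sketch (editorial illustration): pload1 broadcasts one scalar from memory,
//   float s = 2.5f;
//   Packet4f v = pload1<Packet4f>(&s);  // {2.5f, 2.5f, 2.5f, 2.5f}
// which on this path compiles to a single scalar load plus a pshufd.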
472 
473 template <>
474 EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) {
475  return _mm_add_ps(pset1<Packet4f>(a), _mm_set_ps(3, 2, 1, 0));
476 }
477 template <>
478 EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) {
479  return _mm_add_pd(pset1<Packet2d>(a), _mm_set_pd(1, 0));
480 }
481 template <>
482 EIGEN_STRONG_INLINE Packet2l plset<Packet2l>(const int64_t& a) {
483  return _mm_add_epi64(pset1<Packet2l>(a), _mm_set_epi64x(1, 0));
484 }
485 template <>
486 EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) {
487  return _mm_add_epi32(pset1<Packet4i>(a), _mm_set_epi32(3, 2, 1, 0));
488 }
489 template <>
490 EIGEN_STRONG_INLINE Packet4ui plset<Packet4ui>(const uint32_t& a) {
491  return _mm_add_epi32(pset1<Packet4ui>(a), _mm_set_epi32(3, 2, 1, 0));
492 }
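// Usage sketch (editorial illustration): plset yields an arithmetic progression
// starting at a with step 1 per lane:
//   plset<Packet4i>(10)   // {10, 11, 12, 13}
//   plset<Packet2d>(0.5)  // {0.5, 1.5}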
493 
494 template <>
495 EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) {
496  return _mm_add_ps(a, b);
497 }
498 template <>
499 EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) {
500  return _mm_add_pd(a, b);
501 }
502 template <>
503 EIGEN_STRONG_INLINE Packet2l padd<Packet2l>(const Packet2l& a, const Packet2l& b) {
504  return _mm_add_epi64(a, b);
505 }
506 template <>
507 EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) {
508  return _mm_add_epi32(a, b);
509 }
510 template <>
511 EIGEN_STRONG_INLINE Packet4ui padd<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
512  return _mm_add_epi32(a, b);
513 }
514 
515 template <>
516 EIGEN_STRONG_INLINE Packet16b padd<Packet16b>(const Packet16b& a, const Packet16b& b) {
517  return _mm_or_si128(a, b);
518 }
519 
520 template <typename Packet>
521 EIGEN_STRONG_INLINE Packet padds(const Packet& a, const Packet& b);
522 template <>
523 EIGEN_STRONG_INLINE Packet4f padds<Packet4f>(const Packet4f& a, const Packet4f& b) {
524  return _mm_add_ss(a, b);
525 }
526 template <>
527 EIGEN_STRONG_INLINE Packet2d padds<Packet2d>(const Packet2d& a, const Packet2d& b) {
528  return _mm_add_sd(a, b);
529 }
530 
531 template <>
532 EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) {
533  return _mm_sub_ps(a, b);
534 }
535 template <>
536 EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) {
537  return _mm_sub_pd(a, b);
538 }
539 template <>
540 EIGEN_STRONG_INLINE Packet2l psub<Packet2l>(const Packet2l& a, const Packet2l& b) {
541  return _mm_sub_epi64(a, b);
542 }
543 template <>
544 EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) {
545  return _mm_sub_epi32(a, b);
546 }
547 template <>
548 EIGEN_STRONG_INLINE Packet4ui psub<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
549  return _mm_sub_epi32(a, b);
550 }
551 template <>
552 EIGEN_STRONG_INLINE Packet16b psub<Packet16b>(const Packet16b& a, const Packet16b& b) {
553  return _mm_xor_si128(a, b);
554 }
555 
556 template <>
557 EIGEN_STRONG_INLINE Packet4f pxor(const Packet4f& a, const Packet4f& b);
558 template <>
559 EIGEN_STRONG_INLINE Packet4f paddsub<Packet4f>(const Packet4f& a, const Packet4f& b) {
560 #ifdef EIGEN_VECTORIZE_SSE3
561  return _mm_addsub_ps(a, b);
562 #else
563  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000, 0x0, 0x80000000, 0x0));
564  return padd(a, pxor(mask, b));
565 #endif
566 }
567 
568 template <>
569 EIGEN_STRONG_INLINE Packet2d pxor(const Packet2d& a, const Packet2d& b);
570 template <>
571 EIGEN_STRONG_INLINE Packet2d paddsub<Packet2d>(const Packet2d& a, const Packet2d& b) {
572 #ifdef EIGEN_VECTORIZE_SSE3
573  return _mm_addsub_pd(a, b);
574 #else
575  const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0, 0x80000000, 0x0, 0x0));
576  return padd(a, pxor(mask, b));
577 #endif
578 }
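// A worked example (editorial illustration): paddsub subtracts in even lanes and adds
// in odd lanes, matching the SSE3 addsub semantics. With a = {1, 2, 3, 4} and
// b = {10, 20, 30, 40}:
//   paddsub<Packet4f>(a, b)  // {1 - 10, 2 + 20, 3 - 30, 4 + 40} = {-9, 22, -27, 44}
// The pre-SSE3 fallback obtains the same result by flipping the sign bit of b in the
// even lanes and then adding.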
579 
580 template <>
581 EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) {
582  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000));
583  return _mm_xor_ps(a, mask);
584 }
585 template <>
586 EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) {
587  const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0, 0x80000000, 0x0, 0x80000000));
588  return _mm_xor_pd(a, mask);
589 }
590 template <>
591 EIGEN_STRONG_INLINE Packet2l pnegate(const Packet2l& a) {
592  return psub(pzero(a), a);
593 }
594 
595 template <>
596 EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) {
597  return psub(pzero(a), a);
598 }
599 
600 template <>
601 EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) {
602  return a;
603 }
604 template <>
605 EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) {
606  return a;
607 }
608 template <>
609 EIGEN_STRONG_INLINE Packet2l pconj(const Packet2l& a) {
610  return a;
611 }
612 template <>
613 EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) {
614  return a;
615 }
616 
617 template <>
618 EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) {
619  return _mm_mul_ps(a, b);
620 }
621 template <>
622 EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) {
623  return _mm_mul_pd(a, b);
624 }
625 template <>
626 EIGEN_STRONG_INLINE Packet2l pmul<Packet2l>(const Packet2l& a, const Packet2l& b) {
627  // 64-bit mul requires avx512, so do this with 32-bit multiplication
628  __m128i upper32_a = _mm_srli_epi64(a, 32);
629  __m128i upper32_b = _mm_srli_epi64(b, 32);
630 
631  // upper * lower
632  __m128i mul1 = _mm_mul_epu32(upper32_a, b);
633  __m128i mul2 = _mm_mul_epu32(upper32_b, a);
634  // Gives us both upper*upper and lower*lower
635  __m128i mul3 = _mm_mul_epu32(a, b);
636 
637  __m128i high = _mm_slli_epi64(_mm_add_epi64(mul1, mul2), 32);
638  return _mm_add_epi64(high, mul3);
639 }
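// Why the decomposition above is exact modulo 2^64 (editorial note): writing each lane
// as a = 2^32*ah + al and b = 2^32*bh + bl,
//   a*b = 2^64*ah*bh + 2^32*(ah*bl + al*bh) + al*bl,
// and the 2^64 term vanishes modulo 2^64. mul1 and mul2 are ah*bl and al*bh
// (_mm_mul_epu32 multiplies the low 32 bits of each 64-bit lane), mul3 is al*bl, and
// the shift by 32 re-aligns the cross terms before the final add.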
640 template <>
641 EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) {
642 #ifdef EIGEN_VECTORIZE_SSE4_1
643  return _mm_mullo_epi32(a, b);
644 #else
645  // this version is slightly faster than 4 scalar products
646  return vec4i_swizzle1(
647  vec4i_swizzle2(_mm_mul_epu32(a, b), _mm_mul_epu32(vec4i_swizzle1(a, 1, 0, 3, 2), vec4i_swizzle1(b, 1, 0, 3, 2)),
648  0, 2, 0, 2),
649  0, 2, 1, 3);
650 #endif
651 }
652 template <>
653 EIGEN_STRONG_INLINE Packet4ui pmul<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
654 #ifdef EIGEN_VECTORIZE_SSE4_1
655  return _mm_mullo_epi32(a, b);
656 #else
657  // this version is slightly faster than 4 scalar products
658  return vec4ui_swizzle1(
659  vec4ui_swizzle2(_mm_mul_epu32(a, b),
660  _mm_mul_epu32(vec4ui_swizzle1(a, 1, 0, 3, 2), vec4ui_swizzle1(b, 1, 0, 3, 2)), 0, 2, 0, 2),
661  0, 2, 1, 3);
662 #endif
663 }
664 
665 template <>
666 EIGEN_STRONG_INLINE Packet16b pmul<Packet16b>(const Packet16b& a, const Packet16b& b) {
667  return _mm_and_si128(a, b);
668 }
669 
670 template <>
671 EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) {
672  return _mm_div_ps(a, b);
673 }
674 template <>
675 EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) {
676  return _mm_div_pd(a, b);
677 }
678 
679 template <>
680 EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b) {
681 #ifdef EIGEN_VECTORIZE_AVX
682  return _mm256_cvttpd_epi32(_mm256_div_pd(_mm256_cvtepi32_pd(a), _mm256_cvtepi32_pd(b)));
683 #else
684  __m128i q_lo = _mm_cvttpd_epi32(_mm_div_pd(_mm_cvtepi32_pd(a), _mm_cvtepi32_pd(b)));
685  __m128i q_hi = _mm_cvttpd_epi32(
686  _mm_div_pd(_mm_cvtepi32_pd(vec4i_swizzle1(a, 2, 3, 0, 1)), _mm_cvtepi32_pd(vec4i_swizzle1(b, 2, 3, 0, 1))));
687  return vec4i_swizzle1(_mm_unpacklo_epi32(q_lo, q_hi), 0, 2, 1, 3);
688 #endif
689 }
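// A usage note (editorial): the quotient is formed in double precision and then
// truncated, so it rounds toward zero exactly like C++ integer division; every int32
// is exactly representable as a double, so no precision is lost. For example:
//   pdiv<Packet4i>({7, -7, 9, -9}, {2, 2, 3, 3})  // {3, -3, 3, -3}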
690 
691 #ifdef EIGEN_VECTORIZE_FMA
692 template <>
693 EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
694  return _mm_fmadd_ps(a, b, c);
695 }
696 template <>
697 EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
698  return _mm_fmadd_pd(a, b, c);
699 }
700 template <>
701 EIGEN_STRONG_INLINE Packet4f pmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
702  return _mm_fmsub_ps(a, b, c);
703 }
704 template <>
705 EIGEN_STRONG_INLINE Packet2d pmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
706  return _mm_fmsub_pd(a, b, c);
707 }
708 template <>
709 EIGEN_STRONG_INLINE Packet4f pnmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
710  return _mm_fnmadd_ps(a, b, c);
711 }
712 template <>
713 EIGEN_STRONG_INLINE Packet2d pnmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
714  return _mm_fnmadd_pd(a, b, c);
715 }
716 template <>
717 EIGEN_STRONG_INLINE Packet4f pnmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
718  return _mm_fnmsub_ps(a, b, c);
719 }
720 template <>
721 EIGEN_STRONG_INLINE Packet2d pnmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
722  return _mm_fnmsub_pd(a, b, c);
723 }
724 
725 template <typename Packet>
726 EIGEN_STRONG_INLINE Packet pmadds(const Packet& a, const Packet& b, const Packet& c);
727 template <>
728 EIGEN_STRONG_INLINE Packet4f pmadds<Packet4f>(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
729  return _mm_fmadd_ss(a, b, c);
730 }
731 template <>
732 EIGEN_STRONG_INLINE Packet2d pmadds<Packet2d>(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
733  return _mm_fmadd_sd(a, b, c);
734 }
735 #endif
736 
737 #ifdef EIGEN_VECTORIZE_SSE4_1
738 template <>
739 EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) {
740  return _mm_blendv_ps(b, a, mask);
741 }
742 
743 template <>
744 EIGEN_STRONG_INLINE Packet2l pselect(const Packet2l& mask, const Packet2l& a, const Packet2l& b) {
745  return _mm_castpd_si128(_mm_blendv_pd(_mm_castsi128_pd(b), _mm_castsi128_pd(a), _mm_castsi128_pd(mask)));
746 }
747 
748 template <>
749 EIGEN_STRONG_INLINE Packet4i pselect(const Packet4i& mask, const Packet4i& a, const Packet4i& b) {
750  return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(b), _mm_castsi128_ps(a), _mm_castsi128_ps(mask)));
751 }
752 
753 template <>
754 EIGEN_STRONG_INLINE Packet4ui pselect(const Packet4ui& mask, const Packet4ui& a, const Packet4ui& b) {
755  return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(b), _mm_castsi128_ps(a), _mm_castsi128_ps(mask)));
756 }
757 
758 template <>
759 EIGEN_STRONG_INLINE Packet2d pselect(const Packet2d& mask, const Packet2d& a, const Packet2d& b) {
760  return _mm_blendv_pd(b, a, mask);
761 }
762 #endif
763 
764 template <>
765 EIGEN_STRONG_INLINE Packet2l ptrue<Packet2l>(const Packet2l& a) {
766  return _mm_cmpeq_epi32(a, a);
767 }
768 template <>
769 EIGEN_STRONG_INLINE Packet4i ptrue<Packet4i>(const Packet4i& a) {
770  return _mm_cmpeq_epi32(a, a);
771 }
772 template <>
773 EIGEN_STRONG_INLINE Packet16b ptrue<Packet16b>(const Packet16b& a) {
774  return pset1<Packet16b>(true);
775 }
776 template <>
777 EIGEN_STRONG_INLINE Packet4f ptrue<Packet4f>(const Packet4f& a) {
778  Packet4i b = _mm_castps_si128(a);
779  return _mm_castsi128_ps(_mm_cmpeq_epi32(b, b));
780 }
781 template <>
782 EIGEN_STRONG_INLINE Packet2d ptrue<Packet2d>(const Packet2d& a) {
783  Packet4i b = _mm_castpd_si128(a);
784  return _mm_castsi128_pd(_mm_cmpeq_epi32(b, b));
785 }
786 
787 template <>
788 EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) {
789  return _mm_and_ps(a, b);
790 }
791 template <>
792 EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) {
793  return _mm_and_pd(a, b);
794 }
795 template <>
796 EIGEN_STRONG_INLINE Packet2l pand<Packet2l>(const Packet2l& a, const Packet2l& b) {
797  return _mm_and_si128(a, b);
798 }
799 template <>
800 EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) {
801  return _mm_and_si128(a, b);
802 }
803 template <>
804 EIGEN_STRONG_INLINE Packet4ui pand<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
805  return _mm_and_si128(a, b);
806 }
807 template <>
808 EIGEN_STRONG_INLINE Packet16b pand<Packet16b>(const Packet16b& a, const Packet16b& b) {
809  return _mm_and_si128(a, b);
810 }
811 
812 template <>
813 EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) {
814  return _mm_or_ps(a, b);
815 }
816 template <>
817 EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) {
818  return _mm_or_pd(a, b);
819 }
820 template <>
821 EIGEN_STRONG_INLINE Packet2l por<Packet2l>(const Packet2l& a, const Packet2l& b) {
822  return _mm_or_si128(a, b);
823 }
824 template <>
825 EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) {
826  return _mm_or_si128(a, b);
827 }
828 template <>
829 EIGEN_STRONG_INLINE Packet4ui por<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
830  return _mm_or_si128(a, b);
831 }
832 template <>
833 EIGEN_STRONG_INLINE Packet16b por<Packet16b>(const Packet16b& a, const Packet16b& b) {
834  return _mm_or_si128(a, b);
835 }
836 
837 template <>
838 EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) {
839  return _mm_xor_ps(a, b);
840 }
841 template <>
842 EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) {
843  return _mm_xor_pd(a, b);
844 }
845 template <>
846 EIGEN_STRONG_INLINE Packet2l pxor<Packet2l>(const Packet2l& a, const Packet2l& b) {
847  return _mm_xor_si128(a, b);
848 }
849 template <>
850 EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) {
851  return _mm_xor_si128(a, b);
852 }
853 template <>
854 EIGEN_STRONG_INLINE Packet4ui pxor<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
855  return _mm_xor_si128(a, b);
856 }
857 template <>
858 EIGEN_STRONG_INLINE Packet16b pxor<Packet16b>(const Packet16b& a, const Packet16b& b) {
859  return _mm_xor_si128(a, b);
860 }
861 
862 template <>
863 EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) {
864  return _mm_andnot_ps(b, a);
865 }
866 template <>
867 EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) {
868  return _mm_andnot_pd(b, a);
869 }
870 template <>
871 EIGEN_STRONG_INLINE Packet2l pandnot<Packet2l>(const Packet2l& a, const Packet2l& b) {
872  return _mm_andnot_si128(b, a);
873 }
874 template <>
875 EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) {
876  return _mm_andnot_si128(b, a);
877 }
878 template <>
879 EIGEN_STRONG_INLINE Packet4ui pandnot<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
880  return _mm_andnot_si128(b, a);
881 }
882 
883 template <>
884 EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) {
885  return _mm_cmple_ps(a, b);
886 }
887 template <>
888 EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) {
889  return _mm_cmplt_ps(a, b);
890 }
891 template <>
892 EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) {
893  return _mm_cmpnge_ps(a, b);
894 }
895 template <>
896 EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) {
897  return _mm_cmpeq_ps(a, b);
898 }
899 
900 template <>
901 EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b) {
902  return _mm_cmple_pd(a, b);
903 }
904 template <>
905 EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b) {
906  return _mm_cmplt_pd(a, b);
907 }
908 template <>
909 EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b) {
910  return _mm_cmpnge_pd(a, b);
911 }
912 template <>
913 EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) {
914  return _mm_cmpeq_pd(a, b);
915 }
916 template <>
917 EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i& a, const Packet4i& b) {
918  return _mm_cmplt_epi32(a, b);
919 }
920 template <>
921 EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) {
922  return _mm_cmpeq_epi32(a, b);
923 }
924 template <>
925 EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) {
926  return por(pcmp_lt(a, b), pcmp_eq(a, b));
927 }
928 template <>
929 EIGEN_STRONG_INLINE Packet2l pcmp_lt(const Packet2l& a, const Packet2l& b) {
930 #ifdef EIGEN_VECTORIZE_SSE4_2
931  return _mm_cmpgt_epi64(b, a);
932 #else
933  Packet2l eq = Packet2l(_mm_cmpeq_epi32(a, b));
934  Packet2l hi_eq = Packet2l(_mm_shuffle_epi32(eq, (shuffle_mask<1, 1, 3, 3>::mask)));
935  Packet2l lt = Packet2l(_mm_cmplt_epi32(_mm_xor_si128(a, _mm_set1_epi64x(0x80000000ll)), _mm_xor_si128(b, _mm_set1_epi64x(0x80000000ll))));  // low words compare unsigned (editorial reconstruction of a dropped line; the sign-bias is an assumption)
936  Packet2l hi_lt = Packet2l(_mm_shuffle_epi32(lt, (shuffle_mask<1, 1, 3, 3>::mask)));
937  Packet2l lo_lt = Packet2l(_mm_shuffle_epi32(lt, (shuffle_mask<0, 0, 2, 2>::mask)));
938  // return hi(a) < hi(b) || (hi(a) == hi(b) && lo(a) < lo(b))
939  return por(hi_lt, pand(hi_eq, lo_lt));
940 #endif
941 }
942 template <>
943 EIGEN_STRONG_INLINE Packet2l pcmp_eq(const Packet2l& a, const Packet2l& b) {
944 #ifdef EIGEN_VECTORIZE_SSE4_1
945  return _mm_cmpeq_epi64(a, b);
946 #else
947  Packet4i tmp = Packet4i(_mm_cmpeq_epi32(a, b));
948  return Packet2l(pand<Packet4i>(tmp, _mm_shuffle_epi32(tmp, (shuffle_mask<1, 0, 3, 2>::mask))));
949 #endif
950 }
951 template <>
952 EIGEN_STRONG_INLINE Packet2l pcmp_le(const Packet2l& a, const Packet2l& b) {
953  return por(pcmp_lt(a, b), pcmp_eq(a, b));
954 }
955 template <>
956 EIGEN_STRONG_INLINE Packet16b pcmp_eq(const Packet16b& a, const Packet16b& b) {
957  // Mask out invalid bool bits to avoid UB.
958  const Packet16b kBoolMask = pset1<Packet16b>(true);
959  return _mm_and_si128(_mm_cmpeq_epi8(a, b), kBoolMask);
960 }
961 template <>
962 EIGEN_STRONG_INLINE Packet4ui pcmp_eq(const Packet4ui& a, const Packet4ui& b) {
963  return _mm_cmpeq_epi32(a, b);
964 }
965 
966 template <>
967 EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
968 #if EIGEN_GNUC_STRICT_LESS_THAN(6, 3, 0)
969 // There appears to be a bug in GCC, by which the optimizer may
970 // flip the argument order in calls to _mm_min_ps, so we have to
971 // resort to inline ASM here. This is supposed to be fixed in gcc6.3,
972 // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
973 #ifdef EIGEN_VECTORIZE_AVX
974  Packet4f res;
975  asm("vminps %[a], %[b], %[res]" : [res] "=x"(res) : [a] "x"(a), [b] "x"(b));
976 #else
977  Packet4f res = b;
978  asm("minps %[a], %[res]" : [res] "+x"(res) : [a] "x"(a));
979 #endif
980  return res;
981 #else
982  // Arguments are reversed to match NaN propagation behavior of std::min.
983  return _mm_min_ps(b, a);
984 #endif
985 }
986 template <>
987 EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) {
988 #if EIGEN_GNUC_STRICT_LESS_THAN(6, 3, 0)
989 // There appears to be a bug in GCC, by which the optimizer may
990 // flip the argument order in calls to _mm_min_pd, so we have to
991 // resort to inline ASM here. This is supposed to be fixed in gcc6.3,
992 // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
993 #ifdef EIGEN_VECTORIZE_AVX
994  Packet2d res;
995  asm("vminpd %[a], %[b], %[res]" : [res] "=x"(res) : [a] "x"(a), [b] "x"(b));
996 #else
997  Packet2d res = b;
998  asm("minpd %[a], %[res]" : [res] "+x"(res) : [a] "x"(a));
999 #endif
1000  return res;
1001 #else
1002  // Arguments are reversed to match NaN propagation behavior of std::min.
1003  return _mm_min_pd(b, a);
1004 #endif
1005 }
1006 template <>
1007 EIGEN_STRONG_INLINE Packet2l pmin<Packet2l>(const Packet2l& a, const Packet2l& b) {
1008  Packet2l a_lt_mask = pcmp_lt(a, b);
1009  return por(pandnot(b, a_lt_mask), pand(a, a_lt_mask));
1010 }
1011 template <>
1012 EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) {
1013 #ifdef EIGEN_VECTORIZE_SSE4_1
1014  return _mm_min_epi32(a, b);
1015 #else
1016  // after some bench, this version *is* faster than a scalar implementation
1017  Packet4i mask = _mm_cmplt_epi32(a, b);
1018  return _mm_or_si128(_mm_and_si128(mask, a), _mm_andnot_si128(mask, b));
1019 #endif
1020 }
1021 template <>
1022 EIGEN_STRONG_INLINE Packet4ui pmin<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
1023 #ifdef EIGEN_VECTORIZE_SSE4_1
1024  return _mm_min_epu32(a, b);
1025 #else
1026  return padd((Packet4ui)pmin((Packet4i)psub(a, pset1<Packet4ui>(0x80000000UL)),
1027  (Packet4i)psub(b, pset1<Packet4ui>(0x80000000UL))),
1028  pset1<Packet4ui>(0x80000000UL));
1029 #endif
1030 }
1031 
1032 template <>
1033 EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
1034 #if EIGEN_GNUC_STRICT_LESS_THAN(6, 3, 0)
1035 // There appears to be a bug in GCC, by which the optimizer may
1036 // flip the argument order in calls to _mm_max_ps, so we have to
1037 // resort to inline ASM here. This is supposed to be fixed in gcc6.3,
1038 // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
1039 #ifdef EIGEN_VECTORIZE_AVX
1040  Packet4f res;
1041  asm("vmaxps %[a], %[b], %[res]" : [res] "=x"(res) : [a] "x"(a), [b] "x"(b));
1042 #else
1043  Packet4f res = b;
1044  asm("maxps %[a], %[res]" : [res] "+x"(res) : [a] "x"(a));
1045 #endif
1046  return res;
1047 #else
1048  // Arguments are reversed to match NaN propagation behavior of std::max.
1049  return _mm_max_ps(b, a);
1050 #endif
1051 }
1052 template <>
1053 EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) {
1054 #if EIGEN_GNUC_STRICT_LESS_THAN(6, 3, 0)
1055 // There appears to be a bug in GCC, by which the optimizer may
1056 // flip the argument order in calls to _mm_max_pd, so we have to
1057 // resort to inline ASM here. This is supposed to be fixed in gcc6.3,
1058 // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
1059 #ifdef EIGEN_VECTORIZE_AVX
1060  Packet2d res;
1061  asm("vmaxpd %[a], %[b], %[res]" : [res] "=x"(res) : [a] "x"(a), [b] "x"(b));
1062 #else
1063  Packet2d res = b;
1064  asm("maxpd %[a], %[res]" : [res] "+x"(res) : [a] "x"(a));
1065 #endif
1066  return res;
1067 #else
1068  // Arguments are reversed to match NaN propagation behavior of std::max.
1069  return _mm_max_pd(b, a);
1070 #endif
1071 }
1072 template <>
1073 EIGEN_STRONG_INLINE Packet2l pmax<Packet2l>(const Packet2l& a, const Packet2l& b) {
1074  Packet2l a_lt_mask = pcmp_lt(a, b);
1075  return por(pandnot(a, a_lt_mask), pand(b, a_lt_mask));
1076 }
1077 template <>
1078 EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) {
1079 #ifdef EIGEN_VECTORIZE_SSE4_1
1080  return _mm_max_epi32(a, b);
1081 #else
1082  // after some bench, this version *is* faster than a scalar implementation
1083  Packet4i mask = _mm_cmpgt_epi32(a, b);
1084  return _mm_or_si128(_mm_and_si128(mask, a), _mm_andnot_si128(mask, b));
1085 #endif
1086 }
1087 template <>
1088 EIGEN_STRONG_INLINE Packet4ui pmax<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
1089 #ifdef EIGEN_VECTORIZE_SSE4_1
1090  return _mm_max_epu32(a, b);
1091 #else
1092  return padd((Packet4ui)pmax((Packet4i)psub(a, pset1<Packet4ui>(0x80000000UL)),
1093  (Packet4i)psub(b, pset1<Packet4ui>(0x80000000UL))),
1094  pset1<Packet4ui>(0x80000000UL));
1095 #endif
1096 }
1097 
1098 template <>
1099 EIGEN_STRONG_INLINE Packet4ui pcmp_lt(const Packet4ui& a, const Packet4ui& b) {
1100 #ifdef EIGEN_VECTORIZE_SSE4_1
1101  return pxor(pcmp_eq(a, pmax(a, b)), ptrue(a));
1102 #else
1103  return (Packet4ui)pcmp_lt((Packet4i)psub(a, pset1<Packet4ui>(0x80000000UL)),
1104  (Packet4i)psub(b, pset1<Packet4ui>(0x80000000UL)));
1105 #endif
1106 }
1107 template <>
1108 EIGEN_STRONG_INLINE Packet4ui pcmp_le(const Packet4ui& a, const Packet4ui& b) {
1109 #ifdef EIGEN_VECTORIZE_SSE4_1
1110  return pcmp_eq(a, pmin(a, b));
1111 #else
1112  return (Packet4ui)pcmp_le((Packet4i)psub(a, pset1<Packet4ui>(0x80000000UL)),
1113  (Packet4i)psub(b, pset1<Packet4ui>(0x80000000UL)));
1114 #endif
1115 }
1116 
1117 template <typename Packet, typename Op>
1118 EIGEN_STRONG_INLINE Packet pminmax_propagate_numbers(const Packet& a, const Packet& b, Op op) {
1119  // In this implementation, we take advantage of the fact that pmin/pmax for SSE
1120  // always return a if either a or b is NaN.
1121  Packet not_nan_mask_a = pcmp_eq(a, a);
1122  Packet m = op(a, b);
1123  return pselect<Packet>(not_nan_mask_a, m, b);
1124 }
1125 
1126 template <typename Packet, typename Op>
1127 EIGEN_STRONG_INLINE Packet pminmax_propagate_nan(const Packet& a, const Packet& b, Op op) {
1128  // In this implementation, we take advantage of the fact that pmin/pmax for SSE
1129  // always return a if either a or b is NaN.
1130  Packet not_nan_mask_a = pcmp_eq(a, a);
1131  Packet m = op(b, a);
1132  return pselect<Packet>(not_nan_mask_a, m, a);
1133 }
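// A worked example (editorial illustration) for Packet4f, recalling that
// pmin<Packet4f> calls _mm_min_ps(b, a), which returns its second operand (a) when
// either input is NaN. With a = {1, NaN, 3, NaN} and b = {2, 5, NaN, NaN}:
//   pminmax_propagate_numbers(a, b, pmin)  // {1, 5, 3, NaN}     -- prefer the number
//   pminmax_propagate_nan(a, b, pmin)      // {1, NaN, NaN, NaN} -- prefer the NaN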
1134 
1135 // Add specializations for min/max with prescribed NaN propagation.
1136 template <>
1137 EIGEN_STRONG_INLINE Packet4f pmin<PropagateNumbers, Packet4f>(const Packet4f& a, const Packet4f& b) {
1138  return pminmax_propagate_numbers(a, b, pmin<Packet4f>);
1139 }
1140 template <>
1141 EIGEN_STRONG_INLINE Packet2d pmin<PropagateNumbers, Packet2d>(const Packet2d& a, const Packet2d& b) {
1142  return pminmax_propagate_numbers(a, b, pmin<Packet2d>);
1143 }
1144 template <>
1145 EIGEN_STRONG_INLINE Packet4f pmax<PropagateNumbers, Packet4f>(const Packet4f& a, const Packet4f& b) {
1146  return pminmax_propagate_numbers(a, b, pmax<Packet4f>);
1147 }
1148 template <>
1149 EIGEN_STRONG_INLINE Packet2d pmax<PropagateNumbers, Packet2d>(const Packet2d& a, const Packet2d& b) {
1150  return pminmax_propagate_numbers(a, b, pmax<Packet2d>);
1151 }
1152 template <>
1153 EIGEN_STRONG_INLINE Packet4f pmin<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) {
1154  return pminmax_propagate_nan(a, b, pmin<Packet4f>);
1155 }
1156 template <>
1157 EIGEN_STRONG_INLINE Packet2d pmin<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) {
1158  return pminmax_propagate_nan(a, b, pmin<Packet2d>);
1159 }
1160 template <>
1161 EIGEN_STRONG_INLINE Packet4f pmax<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) {
1162  return pminmax_propagate_nan(a, b, pmax<Packet4f>);
1163 }
1164 template <>
1165 EIGEN_STRONG_INLINE Packet2d pmax<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) {
1166  return pminmax_propagate_nan(a, b, pmax<Packet2d>);
1167 }
1168 
1169 template <>
1170 EIGEN_STRONG_INLINE Packet4f psignbit(const Packet4f& a) {
1171  return _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(a), 31));
1172 }
1173 template <>
1174 EIGEN_STRONG_INLINE Packet2d psignbit(const Packet2d& a) {
1175  Packet4f tmp = psignbit<Packet4f>(_mm_castpd_ps(a));
1176 #ifdef EIGEN_VECTORIZE_AVX
1177  return _mm_castps_pd(_mm_permute_ps(tmp, (shuffle_mask<1, 1, 3, 3>::mask)));
1178 #else
1179  return _mm_castps_pd(_mm_shuffle_ps(tmp, tmp, (shuffle_mask<1, 1, 3, 3>::mask)));
1180 #endif // EIGEN_VECTORIZE_AVX
1181 }
1182 template <>
1183 EIGEN_STRONG_INLINE Packet4i psignbit(const Packet4i& a) {
1184  return _mm_srai_epi32(a, 31);
1185 }
1186 template <>
1187 EIGEN_STRONG_INLINE Packet4ui psignbit(const Packet4ui& a) {
1188  return pzero(a);
1189 }
1190 template <>
1191 EIGEN_STRONG_INLINE Packet2l psignbit(const Packet2l& a) {
1192  Packet4i tmp = psignbit<Packet4i>(Packet4i(a));
1193  return Packet2l(_mm_shuffle_epi32(tmp, (shuffle_mask<1, 1, 3, 3>::mask)));
1194 }
1195 
1196 template <int N>
1197 EIGEN_STRONG_INLINE Packet2l parithmetic_shift_right(const Packet2l& a) {
1198  Packet2l signbit = psignbit(a);
1199  return por(_mm_slli_epi64(signbit, 64 - N), _mm_srli_epi64(a, N));
1200 }
1201 template <int N>
1202 EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) {
1203  return _mm_srli_epi64(a, N);
1204 }
1205 template <int N>
1206 EIGEN_STRONG_INLINE Packet2l plogical_shift_left(const Packet2l& a) {
1207  return _mm_slli_epi64(a, N);
1208 }
1209 template <int N>
1210 EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packet4i& a) {
1211  return _mm_srai_epi32(a, N);
1212 }
1213 template <int N>
1214 EIGEN_STRONG_INLINE Packet4i plogical_shift_right(const Packet4i& a) {
1215  return _mm_srli_epi32(a, N);
1216 }
1217 template <int N>
1218 EIGEN_STRONG_INLINE Packet4i plogical_shift_left(const Packet4i& a) {
1219  return _mm_slli_epi32(a, N);
1220 }
1221 template <int N>
1222 EIGEN_STRONG_INLINE Packet4ui parithmetic_shift_right(const Packet4ui& a) {
1223  return _mm_srli_epi32(a, N);
1224 }
1225 template <int N>
1226 EIGEN_STRONG_INLINE Packet4ui plogical_shift_right(const Packet4ui& a) {
1227  return _mm_srli_epi32(a, N);
1228 }
1229 template <int N>
1230 EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(const Packet4ui& a) {
1231  return _mm_slli_epi32(a, N);
1232 }
1233 
1234 template <>
1235 EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) {
1236  const __m128i mask = _mm_setr_epi32(0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF);
1237  return _mm_castsi128_ps(_mm_and_si128(mask, _mm_castps_si128(a)));
1238 }
1239 template <>
1240 EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) {
1241  const __m128i mask = _mm_setr_epi32(0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF);
1242  return _mm_castsi128_pd(_mm_and_si128(mask, _mm_castpd_si128(a)));
1243 }
1244 template <>
1245 EIGEN_STRONG_INLINE Packet2l pabs(const Packet2l& a) {
1246  Packet2l signbit = psignbit(a);
1247  return _mm_sub_epi64(_mm_xor_si128(a, signbit), signbit);
1248 }
1249 template <>
1250 EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) {
1251 #ifdef EIGEN_VECTORIZE_SSSE3
1252  return _mm_abs_epi32(a);
1253 #else
1254  Packet4i signbit = _mm_srai_epi32(a, 31);
1255  return _mm_sub_epi32(_mm_xor_si128(a, signbit), signbit);
1256 #endif
1257 }
1258 template <>
1259 EIGEN_STRONG_INLINE Packet4ui pabs(const Packet4ui& a) {
1260  return a;
1261 }
1262 
1263 #ifdef EIGEN_VECTORIZE_SSE4_1
1264 template <>
1265 EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) {
1266  // Unfortunately _mm_round_ps doesn't have a rounding mode to implement numext::round.
1267  const Packet4f mask = pset1frombits<Packet4f>(0x80000000u);
1268  const Packet4f prev0dot5 = pset1frombits<Packet4f>(0x3EFFFFFFu);
1269  return _mm_round_ps(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
1270 }
1271 
1272 template <>
1273 EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) {
1274  const Packet2d mask = _mm_castsi128_pd(_mm_set_epi64x(0x8000000000000000ull, 0x8000000000000000ull));
1275  const Packet2d prev0dot5 = _mm_castsi128_pd(_mm_set_epi64x(0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull));
1276  return _mm_round_pd(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
1277 }
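// A worked example (editorial illustration) of the trick above: numext::round rounds
// half away from zero, which SSE4.1 does not provide directly. prev0dot5 is the
// largest float below 0.5 (0x3EFFFFFF ~= 0.49999997f); adding it with the sign of a
// and then truncating gives the desired result:
//   a = 2.5f:   2.5f + 0.49999997f rounds to 3.0f in float arithmetic; trunc -> 3.0f
//   a = -2.5f:  -2.5f - 0.49999997f rounds to -3.0f; trunc -> -3.0f
//   a = 1.3f:   1.3f + 0.49999997f ~= 1.7999999f; trunc -> 1.0f
// Using 0.5f itself would misround a = 0.49999997f, since 0.49999997f + 0.5f rounds
// up to 1.0f.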
1278 
1279 template <>
1280 EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a) {
1281  return _mm_round_ps(a, _MM_FROUND_CUR_DIRECTION);
1282 }
1283 template <>
1284 EIGEN_STRONG_INLINE Packet2d print<Packet2d>(const Packet2d& a) {
1285  return _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
1286 }
1287 
1288 template <>
1289 EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) {
1290  return _mm_ceil_ps(a);
1291 }
1292 template <>
1293 EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) {
1294  return _mm_ceil_pd(a);
1295 }
1296 
1297 template <>
1298 EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) {
1299  return _mm_floor_ps(a);
1300 }
1301 template <>
1302 EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) {
1303  return _mm_floor_pd(a);
1304 }
1305 
1306 template <>
1307 EIGEN_STRONG_INLINE Packet4f ptrunc<Packet4f>(const Packet4f& a) {
1308  return _mm_round_ps(a, _MM_FROUND_TRUNC);
1309 }
1310 template <>
1311 EIGEN_STRONG_INLINE Packet2d ptrunc<Packet2d>(const Packet2d& a) {
1312  return _mm_round_pd(a, _MM_FROUND_TRUNC);
1313 }
1314 #endif
1315 
1316 template <>
1317 EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) {
1318  EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from);
1319 }
1320 template <>
1321 EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) {
1322  EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from);
1323 }
1324 template <>
1325 EIGEN_STRONG_INLINE Packet2l pload<Packet2l>(const int64_t* from) {
1326  EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from));
1327 }
1328 template <>
1329 EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) {
1330  EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from));
1331 }
1332 template <>
1333 EIGEN_STRONG_INLINE Packet4ui pload<Packet4ui>(const uint32_t* from) {
1334  EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from));
1335 }
1336 template <>
1337 EIGEN_STRONG_INLINE Packet16b pload<Packet16b>(const bool* from) {
1338  EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from));
1339 }
1340 
1341 #if EIGEN_COMP_MSVC
1342 template <>
1343 EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
1344  EIGEN_DEBUG_UNALIGNED_LOAD
1345  return _mm_loadu_ps(from);
1346 }
1347 #else
1348 // NOTE: with the code below, MSVC's compiler crashes!
1349 
1350 template <>
1351 EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
1352  EIGEN_DEBUG_UNALIGNED_LOAD
1353  return _mm_loadu_ps(from);
1354 }
1355 #endif
1356 
1357 template <>
1358 EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) {
1359  EIGEN_DEBUG_UNALIGNED_LOAD
1360  return _mm_loadu_pd(from);
1361 }
1362 template <>
1363 EIGEN_STRONG_INLINE Packet2l ploadu<Packet2l>(const int64_t* from) {
1364  EIGEN_DEBUG_UNALIGNED_LOAD
1365  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
1366 }
1367 template <>
1368 EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) {
1369  EIGEN_DEBUG_UNALIGNED_LOAD
1370  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
1371 }
1372 template <>
1373 EIGEN_STRONG_INLINE Packet4ui ploadu<Packet4ui>(const uint32_t* from) {
1374  EIGEN_DEBUG_UNALIGNED_LOAD
1375  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
1376 }
1377 template <>
1378 EIGEN_STRONG_INLINE Packet16b ploadu<Packet16b>(const bool* from) {
1379  EIGEN_DEBUG_UNALIGNED_LOAD
1380  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
1381 }
1382 
1383 // Load lower part of packet zero extending.
1384 template <typename Packet>
1385 EIGEN_STRONG_INLINE Packet ploadl(const typename unpacket_traits<Packet>::type* from);
1386 template <>
1387 EIGEN_STRONG_INLINE Packet4f ploadl<Packet4f>(const float* from) {
1388  EIGEN_DEBUG_UNALIGNED_LOAD return _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(from)));
1389 }
1390 template <>
1391 EIGEN_STRONG_INLINE Packet2d ploadl<Packet2d>(const double* from) {
1392  EIGEN_DEBUG_UNALIGNED_LOAD return _mm_load_sd(from);
1393 }
1394 
1395 // Load scalar
1396 template <typename Packet>
1397 EIGEN_STRONG_INLINE Packet ploads(const typename unpacket_traits<Packet>::type* from);
1398 template <>
1399 EIGEN_STRONG_INLINE Packet4f ploads<Packet4f>(const float* from) {
1400  EIGEN_DEBUG_UNALIGNED_LOAD return _mm_load_ss(from);
1401 }
1402 template <>
1403 EIGEN_STRONG_INLINE Packet2d ploads<Packet2d>(const double* from) {
1404  EIGEN_DEBUG_UNALIGNED_LOAD return _mm_load_sd(from);
1405 }
1406 
1407 template <>
1408 EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) {
1409  return vec4f_swizzle1(_mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(from))), 0, 0, 1, 1);
1410 }
1411 template <>
1412 EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) {
1413  return pset1<Packet2d>(from[0]);
1414 }
1415 template <>
1416 EIGEN_STRONG_INLINE Packet2l ploaddup<Packet2l>(const int64_t* from) {
1417  return pset1<Packet2l>(from[0]);
1418 }
1419 template <>
1420 EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from) {
1421  Packet4i tmp;
1422  tmp = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(from));
1423  return vec4i_swizzle1(tmp, 0, 0, 1, 1);
1424 }
1425 template <>
1426 EIGEN_STRONG_INLINE Packet4ui ploaddup<Packet4ui>(const uint32_t* from) {
1427  Packet4ui tmp;
1428  tmp = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(from));
1429  return vec4ui_swizzle1(tmp, 0, 0, 1, 1);
1430 }
1431 
1432 // Loads 8 bools from memory and returns the packet
1433 // {b0, b0, b1, b1, b2, b2, b3, b3, b4, b4, b5, b5, b6, b6, b7, b7}
1434 template <>
1435 EIGEN_STRONG_INLINE Packet16b ploaddup<Packet16b>(const bool* from) {
1436  __m128i tmp = _mm_castpd_si128(pload1<Packet2d>(reinterpret_cast<const double*>(from)));
1437  return _mm_unpacklo_epi8(tmp, tmp);
1438 }
1439 
1440 // Loads 4 bools from memory and returns the packet
1441 // {b0, b0 b0, b0, b1, b1, b1, b1, b2, b2, b2, b2, b3, b3, b3, b3}
1442 template <>
1443 EIGEN_STRONG_INLINE Packet16b ploadquad<Packet16b>(const bool* from) {
1444  __m128i tmp = _mm_castps_si128(pload1<Packet4f>(reinterpret_cast<const float*>(from)));
1445  tmp = _mm_unpacklo_epi8(tmp, tmp);
1446  return _mm_unpacklo_epi16(tmp, tmp);
1447 }
1448 
1449 template <>
1450 EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) {
1451  EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from);
1452 }
1453 template <>
1454 EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) {
1455  EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from);
1456 }
1457 template <>
1458 EIGEN_STRONG_INLINE void pstore<int64_t>(int64_t* to, const Packet2l& from) {
1459  EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from);
1460 }
1461 template <>
1462 EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) {
1463  EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from);
1464 }
1465 template <>
1466 EIGEN_STRONG_INLINE void pstore<uint32_t>(uint32_t* to, const Packet4ui& from) {
1467  EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from);
1468 }
1469 template <>
1470 EIGEN_STRONG_INLINE void pstore<bool>(bool* to, const Packet16b& from) {
1471  EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from);
1472 }
1473 
1474 template <>
1475 EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) {
1476  EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_pd(to, from);
1477 }
1478 template <>
1479 EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) {
1480  EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_ps(to, from);
1481 }
1482 template <>
1483 EIGEN_STRONG_INLINE void pstoreu<int64_t>(int64_t* to, const Packet2l& from) {
1484  EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from);
1485 }
1486 template <>
1487 EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) {
1488  EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from);
1489 }
1490 template <>
1491 EIGEN_STRONG_INLINE void pstoreu<uint32_t>(uint32_t* to, const Packet4ui& from) {
1492  EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from);
1493 }
1494 template <>
1495 EIGEN_STRONG_INLINE void pstoreu<bool>(bool* to, const Packet16b& from) {
1496  EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from);
1497 }
1498 
1499 template <typename Scalar, typename Packet>
1500 EIGEN_STRONG_INLINE void pstorel(Scalar* to, const Packet& from);
1501 template <>
1502 EIGEN_STRONG_INLINE void pstorel(float* to, const Packet4f& from) {
1503  EIGEN_DEBUG_UNALIGNED_STORE _mm_storel_pi(reinterpret_cast<__m64*>(to), from);
1504 }
1505 template <>
1506 EIGEN_STRONG_INLINE void pstorel(double* to, const Packet2d& from) {
1507  EIGEN_DEBUG_UNALIGNED_STORE _mm_storel_pd(to, from);
1508 }
1509 
1510 template <typename Scalar, typename Packet>
1511 EIGEN_STRONG_INLINE void pstores(Scalar* to, const Packet& from);
1512 template <>
1513 EIGEN_STRONG_INLINE void pstores(float* to, const Packet4f& from) {
1514  EIGEN_DEBUG_UNALIGNED_STORE _mm_store_ss(to, from);
1515 }
1516 template <>
1517 EIGEN_STRONG_INLINE void pstores(double* to, const Packet2d& from) {
1518  EIGEN_DEBUG_UNALIGNED_STORE _mm_store_sd(to, from);
1519 }
1520 
1521 template <>
1522 EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
1523  return _mm_shuffle_ps(a, a, 0x1B);
1524 }
1525 template <>
1526 EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) {
1527  return _mm_shuffle_pd(a, a, 0x1);
1528 }
1529 template <>
1530 EIGEN_STRONG_INLINE Packet2l preverse(const Packet2l& a) {
1531  return _mm_castpd_si128(preverse(_mm_castsi128_pd(a)));
1532 }
1533 template <>
1534 EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
1535  return _mm_shuffle_epi32(a, 0x1B);
1536 }
1537 template <>
1538 EIGEN_STRONG_INLINE Packet4ui preverse(const Packet4ui& a) {
1539  return _mm_shuffle_epi32(a, 0x1B);
1540 }
1541 template <>
1542 EIGEN_STRONG_INLINE Packet16b preverse(const Packet16b& a) {
1543 #ifdef EIGEN_VECTORIZE_SSSE3
1544  __m128i mask = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
1545  return _mm_shuffle_epi8(a, mask);
1546 #else
1547  Packet16b tmp = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 1, 2, 3));
1548  tmp = _mm_shufflehi_epi16(_mm_shufflelo_epi16(tmp, _MM_SHUFFLE(2, 3, 0, 1)), _MM_SHUFFLE(2, 3, 0, 1));
1549  return _mm_or_si128(_mm_slli_epi16(tmp, 8), _mm_srli_epi16(tmp, 8));
1550 #endif
1551 }
1552 
1553 #if EIGEN_COMP_MSVC_STRICT && EIGEN_OS_WIN64
1554 // The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010
1555 // Direct access to the struct members fixed bug #62.
1556 template <>
1557 EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
1558  return a.m128_f32[0];
1559 }
1560 template <>
1561 EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
1562  return a.m128d_f64[0];
1563 }
1564 template <>
1565 EIGEN_STRONG_INLINE int64_t pfirst<Packet2l>(const Packet2l& a) {
1566  int64_t x = _mm_extract_epi64_0(a);
1567  return x;
1568 }
1569 template <>
1570 EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) {
1571  int x = _mm_cvtsi128_si32(a);
1572  return x;
1573 }
1574 template <>
1576  uint32_t x = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(a));
1577  return x;
1578 }
1579 #elif EIGEN_COMP_MSVC_STRICT
1580 // The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010
1581 template <>
1582 EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
1583  float x = _mm_cvtss_f32(a);
1584  return x;
1585 }
1586 template <>
1587 EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
1588  double x = _mm_cvtsd_f64(a);
1589  return x;
1590 }
1591 template <>
1592 EIGEN_STRONG_INLINE int64_t pfirst<Packet2l>(const Packet2l& a) {
1593  int64_t x = _mm_extract_epi64_0(a);
1594  return x;
1595 }
1596 template <>
1597 EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) {
1598  int x = _mm_cvtsi128_si32(a);
1599  return x;
1600 }
1601 template <>
1602 EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(const Packet4ui& a) {
1603  uint32_t x = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(a));
1604  return x;
1605 }
1606 #else
1607 template <>
1608 EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
1609  return _mm_cvtss_f32(a);
1610 }
1611 template <>
1612 EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
1613  return _mm_cvtsd_f64(a);
1614 }
1615 template <>
1616 EIGEN_STRONG_INLINE int64_t pfirst<Packet2l>(const Packet2l& a) {
1617  return _mm_extract_epi64_0(a);
1618 }
1619 template <>
1620 EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) {
1621  return _mm_cvtsi128_si32(a);
1622 }
1623 template <>
1624 EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(const Packet4ui& a) {
1625  return numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(a));
1626 }
1627 #endif
1628 template <>
1629 EIGEN_STRONG_INLINE bool pfirst<Packet16b>(const Packet16b& a) {
1630  int x = _mm_cvtsi128_si32(a);
1631  return static_cast<bool>(x & 1);
1632 }
1633 
1634 template <>
1635 EIGEN_STRONG_INLINE Packet4f pgather<float, Packet4f>(const float* from, Index stride) {
1636  return _mm_set_ps(from[3 * stride], from[2 * stride], from[1 * stride], from[0 * stride]);
1637 }
1638 template <>
1639 EIGEN_STRONG_INLINE Packet2d pgather<double, Packet2d>(const double* from, Index stride) {
1640  return _mm_set_pd(from[1 * stride], from[0 * stride]);
1641 }
1642 template <>
1643 EIGEN_STRONG_INLINE Packet2l pgather<int64_t, Packet2l>(const int64_t* from, Index stride) {
1644  return _mm_set_epi64x(from[1 * stride], from[0 * stride]);
1645 }
1646 template <>
1647 EIGEN_STRONG_INLINE Packet4i pgather<int, Packet4i>(const int* from, Index stride) {
1648  return _mm_set_epi32(from[3 * stride], from[2 * stride], from[1 * stride], from[0 * stride]);
1649 }
1650 template <>
1651 EIGEN_STRONG_INLINE Packet4ui pgather<uint32_t, Packet4ui>(const uint32_t* from, Index stride) {
1652  return _mm_set_epi32(numext::bit_cast<int32_t>(from[3 * stride]), numext::bit_cast<int32_t>(from[2 * stride]),
1653  numext::bit_cast<int32_t>(from[1 * stride]), numext::bit_cast<int32_t>(from[0 * stride]));
1654 }
1655 
1656 template <>
1657 EIGEN_STRONG_INLINE Packet16b pgather<bool, Packet16b>(const bool* from, Index stride) {
1658  return _mm_set_epi8(from[15 * stride], from[14 * stride], from[13 * stride], from[12 * stride], from[11 * stride],
1659  from[10 * stride], from[9 * stride], from[8 * stride], from[7 * stride], from[6 * stride],
1660  from[5 * stride], from[4 * stride], from[3 * stride], from[2 * stride], from[1 * stride],
1661  from[0 * stride]);
1662 }
1663 
1664 template <>
1665 EIGEN_STRONG_INLINE void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride) {
1666  to[stride * 0] = pfirst(from);
1667  to[stride * 1] = pfirst(_mm_shuffle_ps(from, from, 1));
1668  to[stride * 2] = pfirst(_mm_shuffle_ps(from, from, 2));
1669  to[stride * 3] = pfirst(_mm_shuffle_ps(from, from, 3));
1670 }
1671 template <>
1672 EIGEN_STRONG_INLINE void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride) {
1673  to[stride * 0] = pfirst(from);
1674  to[stride * 1] = pfirst(preverse(from));
1675 }
1676 template <>
1677 EIGEN_STRONG_INLINE void pscatter<int64_t, Packet2l>(int64_t* to, const Packet2l& from, Index stride) {
1678  to[stride * 0] = pfirst(from);
1679  to[stride * 1] = pfirst(preverse(from));
1680 }
1681 template <>
1682 EIGEN_STRONG_INLINE void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride) {
1683  to[stride * 0] = _mm_cvtsi128_si32(from);
1684  to[stride * 1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1));
1685  to[stride * 2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2));
1686  to[stride * 3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3));
1687 }
1688 template <>
1689 EIGEN_STRONG_INLINE void pscatter<uint32_t, Packet4ui>(uint32_t* to, const Packet4ui& from, Index stride) {
1690  to[stride * 0] = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(from));
1691  to[stride * 1] = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1)));
1692  to[stride * 2] = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2)));
1693  to[stride * 3] = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3)));
1694 }
1695 template <>
1696 EIGEN_STRONG_INLINE void pscatter<bool, Packet16b>(bool* to, const Packet16b& from, Index stride) {
1697  to[4 * stride * 0] = _mm_cvtsi128_si32(from);
1698  to[4 * stride * 1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1));
1699  to[4 * stride * 2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2));
1700  to[4 * stride * 3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3));
1701 }
1702 
1703 // some compilers might be tempted to perform multiple moves instead of using a vector path.
1704 template <>
1705 EIGEN_STRONG_INLINE void pstore1<Packet4f>(float* to, const float& a) {
1706  Packet4f pa = _mm_set_ss(a);
1707  pstore(to, Packet4f(vec4f_swizzle1(pa, 0, 0, 0, 0)));
1708 }
1709 // some compilers might be tempted to perform multiple moves instead of using a vector path.
1710 template <>
1711 EIGEN_STRONG_INLINE void pstore1<Packet2d>(double* to, const double& a) {
1712  Packet2d pa = _mm_set_sd(a);
1713  pstore(to, Packet2d(vec2d_swizzle1(pa, 0, 0)));
1714 }
1715 
1716 #if EIGEN_COMP_PGI && EIGEN_COMP_PGI < 1900
1717 typedef const void* SsePrefetchPtrType;
1718 #else
1719 typedef const char* SsePrefetchPtrType;
1720 #endif
1721 
1722 #ifndef EIGEN_VECTORIZE_AVX
1723 template <>
1724 EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) {
1725  _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
1726 }
1727 template <>
1728 EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) {
1729  _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
1730 }
1731 template <>
1732 EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) {
1733  _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
1734 }
1735 template <>
1736 EIGEN_STRONG_INLINE void prefetch<int64_t>(const int64_t* addr) {
1737  _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
1738 }
1739 template <>
1740 EIGEN_STRONG_INLINE void prefetch<uint32_t>(const uint32_t* addr) {
1741  _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
1742 }
1743 #endif
1744 
1745 template <>
1746 EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
1747  return pfrexp_generic(a, exponent);
1748 }
1749 
1750 // Extract exponent without existence of Packet2l.
1751 template <>
1752 EIGEN_STRONG_INLINE Packet2d pfrexp_generic_get_biased_exponent(const Packet2d& a) {
1753  const Packet2d cst_exp_mask = pset1frombits<Packet2d>(static_cast<uint64_t>(0x7ff0000000000000ull));
1754  __m128i a_expo = _mm_srli_epi64(_mm_castpd_si128(pand(a, cst_exp_mask)), 52);
1755  return _mm_cvtepi32_pd(vec4i_swizzle1(a_expo, 0, 2, 1, 3));
1756 }
1757 
1758 template <>
1759 EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d>(const Packet2d& a, Packet2d& exponent) {
1760  return pfrexp_generic(a, exponent);
1761 }
1762 
1763 template <>
1764 EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
1765  return pldexp_generic(a, exponent);
1766 }
1767 
1768 // We specialize pldexp here, since the generic implementation uses Packet2l, which is not well
1769 // supported by SSE, and has more range than is needed for exponents.
1770 template <>
1771 EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
1772  // Clamp exponent to [-2099, 2099]
1773  const Packet2d max_exponent = pset1<Packet2d>(2099.0);
1774  const Packet2d e = pmin(pmax(exponent, pnegate(max_exponent)), max_exponent);
1775 
1776  // Convert e to integer and swizzle to low-order bits.
1777  const Packet4i ei = vec4i_swizzle1(_mm_cvtpd_epi32(e), 0, 3, 1, 3);
1778 
1779  // Split 2^e into four factors and multiply:
1780  const Packet4i bias = _mm_set_epi32(0, 1023, 0, 1023);
1781  Packet4i b = parithmetic_shift_right<2>(ei); // floor(e/4)
1782  Packet2d c = _mm_castsi128_pd(_mm_slli_epi64(padd(b, bias), 52)); // 2^b
1783  Packet2d out = pmul(pmul(pmul(a, c), c), c); // a * 2^(3b)
1784  b = psub(psub(psub(ei, b), b), b); // e - 3b
1785  c = _mm_castsi128_pd(_mm_slli_epi64(padd(b, bias), 52)); // 2^(e - 3b)
1786  out = pmul(out, c); // a * 2^e
1787  return out;
1788 }
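// A quick check of the splitting above (editorial note): with b = e >> 2 and
// r = e - 3b, we have a * 2^e = ((a * 2^b) * 2^b) * 2^b * 2^r. For the extreme
// e = +/-2099, b lies in [-525, 524] and r in [-524, 527], so b + 1023 and r + 1023
// both stay inside the valid biased-exponent range (0, 2047) used by
// _mm_slli_epi64(..., 52); splitting into four factors is what makes the full
// [-2099, 2099] range reachable without overflowing any single factor.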
1789 
1790 // We specialize pldexp_fast here, since the generic implementation uses Packet2l, which is not well
1791 // supported by SSE, and has more range than is needed for exponents.
1792 template <>
1793 EIGEN_STRONG_INLINE Packet2d pldexp_fast<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
1794  // Clamp exponent to [-1023, 1024]
1795  const Packet2d min_exponent = pset1<Packet2d>(-1023.0);
1796  const Packet2d max_exponent = pset1<Packet2d>(1024.0);
1797  const Packet2d e = pmin(pmax(exponent, min_exponent), max_exponent);
1798 
1799  // Convert e to integer and swizzle to low-order bits.
1800  const Packet4i ei = vec4i_swizzle1(_mm_cvtpd_epi32(e), 0, 3, 1, 3);
1801 
1802  // Compute 2^e multiply:
1803  const Packet4i bias = _mm_set_epi32(0, 1023, 0, 1023);
1804  const Packet2d c = _mm_castsi128_pd(_mm_slli_epi64(padd(ei, bias), 52)); // 2^e
1805  return pmul(a, c);
1806 }
1807 
1808 // with AVX, the default implementations based on pload1 are faster
1809 #ifndef __AVX__
1810 template <>
1811 EIGEN_STRONG_INLINE void pbroadcast4<Packet4f>(const float* a, Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) {
1812  a3 = pload<Packet4f>(a);
1813  a0 = vec4f_swizzle1(a3, 0, 0, 0, 0);
1814  a1 = vec4f_swizzle1(a3, 1, 1, 1, 1);
1815  a2 = vec4f_swizzle1(a3, 2, 2, 2, 2);
1816  a3 = vec4f_swizzle1(a3, 3, 3, 3, 3);
1817 }
1818 template <>
1819 EIGEN_STRONG_INLINE void pbroadcast4<Packet2d>(const double* a, Packet2d& a0, Packet2d& a1, Packet2d& a2,
1820  Packet2d& a3) {
1821 #ifdef EIGEN_VECTORIZE_SSE3
1822  a0 = _mm_loaddup_pd(a + 0);
1823  a1 = _mm_loaddup_pd(a + 1);
1824  a2 = _mm_loaddup_pd(a + 2);
1825  a3 = _mm_loaddup_pd(a + 3);
1826 #else
1827  a1 = pload<Packet2d>(a);
1828  a0 = vec2d_swizzle1(a1, 0, 0);
1829  a1 = vec2d_swizzle1(a1, 1, 1);
1830  a3 = pload<Packet2d>(a + 2);
1831  a2 = vec2d_swizzle1(a3, 0, 0);
1832  a3 = vec2d_swizzle1(a3, 1, 1);
1833 #endif
1834 }
1835 #endif
1836 
1837 EIGEN_STRONG_INLINE void punpackp(Packet4f* vecs) {
1838  vecs[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x55));
1839  vecs[2] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xAA));
1840  vecs[3] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xFF));
1841  vecs[0] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x00));
1842 }
1843 
1844 template <>
1845 EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) {
1846  // Disable the SSE3 _mm_hadd_ps, which is extremely slow on all existing Intel architectures
1847  // (from Nehalem to Haswell).
1848  // #ifdef EIGEN_VECTORIZE_SSE3
1849  // Packet4f tmp = _mm_add_ps(a, vec4f_swizzle1(a,2,3,2,3));
1850  // return pfirst<Packet4f>(_mm_hadd_ps(tmp, tmp));
1851  // #else
1852  Packet4f tmp = _mm_add_ps(a, _mm_movehl_ps(a, a));
1853  return pfirst<Packet4f>(_mm_add_ss(tmp, _mm_shuffle_ps(tmp, tmp, 1)));
1854  // #endif
1855 }
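// Step by step (editorial illustration), for a = {a0, a1, a2, a3}:
//   _mm_movehl_ps(a, a)           // {a2, a3, a2, a3}
//   tmp = a + that                // lane0 = a0 + a2, lane1 = a1 + a3
//   _mm_shuffle_ps(tmp, tmp, 1)   // lane0 = a1 + a3
//   _mm_add_ss + pfirst           // (a0 + a2) + (a1 + a3)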
1856 
1857 template <>
1858 EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) {
1859  // Disable the SSE3 _mm_hadd_pd, which is extremely slow on all existing Intel architectures
1860  // (from Nehalem to Haswell).
1861  // #ifdef EIGEN_VECTORIZE_SSE3
1862  // return pfirst<Packet2d>(_mm_hadd_pd(a, a));
1863  // #else
1864  return pfirst<Packet2d>(_mm_add_sd(a, _mm_unpackhi_pd(a, a)));
1865  // #endif
1866 }
1867 
1868 template <>
1869 EIGEN_STRONG_INLINE int64_t predux<Packet2l>(const Packet2l& a) {
1870  return pfirst<Packet2l>(_mm_add_epi64(a, _mm_unpackhi_epi64(a, a)));
1871 }
1872 
1873 #ifdef EIGEN_VECTORIZE_SSSE3
1874 template <>
1875 EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a) {
1876  Packet4i tmp0 = _mm_hadd_epi32(a, a);
1877  return pfirst<Packet4i>(_mm_hadd_epi32(tmp0, tmp0));
1878 }
1879 template <>
1880 EIGEN_STRONG_INLINE uint32_t predux<Packet4ui>(const Packet4ui& a) {
1881  Packet4ui tmp0 = _mm_hadd_epi32(a, a);
1882  return pfirst<Packet4ui>(_mm_hadd_epi32(tmp0, tmp0));
1883 }
1884 #else
1885 template <>
1886 EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a) {
1887  Packet4i tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a, a));
1888  return pfirst(tmp) + pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1));
1889 }
1890 template <>
1891 EIGEN_STRONG_INLINE uint32_t predux<Packet4ui>(const Packet4ui& a) {
1892  Packet4ui tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a, a));
1893  return pfirst(tmp) + pfirst<Packet4ui>(_mm_shuffle_epi32(tmp, 1));
1894 }
1895 #endif
1896 
1897 template <>
1898 EIGEN_STRONG_INLINE bool predux<Packet16b>(const Packet16b& a) {
1899  Packet4i tmp = _mm_or_si128(a, _mm_unpackhi_epi64(a, a));
1900  return (pfirst(tmp) != 0) || (pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1)) != 0);
1901 }
1902 
1903 // Other reduction functions:
1904 
1905 // mul
1906 template <>
1907 EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) {
1908  Packet4f tmp = _mm_mul_ps(a, _mm_movehl_ps(a, a));
1909  return pfirst<Packet4f>(_mm_mul_ss(tmp, _mm_shuffle_ps(tmp, tmp, 1)));
1910 }
1911 template <>
1912 EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) {
1913  return pfirst<Packet2d>(_mm_mul_sd(a, _mm_unpackhi_pd(a, a)));
1914 }
1915 template <>
1916 EIGEN_STRONG_INLINE int64_t predux_mul<Packet2l>(const Packet2l& a) {
1917  EIGEN_ALIGN16 int64_t aux[2];
1918  pstore(aux, a);
1919  return aux[0] * aux[1];
1920 }
1921 template <>
1922 EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a) {
1923  // after some experiments, it seems this is the fastest way to implement it
1924  // for GCC (e.g., reusing pmul is very slow!)
1925  // TODO try to call _mm_mul_epu32 directly
1926  EIGEN_ALIGN16 int aux[4];
1927  pstore(aux, a);
1928  return (aux[0] * aux[1]) * (aux[2] * aux[3]);
1929 }
1930 template <>
1931 EIGEN_STRONG_INLINE uint32_t predux_mul<Packet4ui>(const Packet4ui& a) {
1932  // after some experiments, it seems this is the fastest way to implement it
1933  // for GCC (e.g., reusing pmul is very slow!)
1934  // TODO try to call _mm_mul_epu32 directly
1935  EIGEN_ALIGN16 uint32_t aux[4];
1936  pstore(aux, a);
1937  return (aux[0] * aux[1]) * (aux[2] * aux[3]);
1938 }
1939 
1940 template <>
1941 EIGEN_STRONG_INLINE bool predux_mul<Packet16b>(const Packet16b& a) {
1942  Packet4i tmp = _mm_and_si128(a, _mm_unpackhi_epi64(a, a));
1943  return ((pfirst<Packet4i>(tmp) == 0x01010101) && (pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1)) == 0x01010101));
1944 }
1945 
1946 // min
1947 template <>
1948 EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) {
1949  Packet4f tmp = _mm_min_ps(a, _mm_movehl_ps(a, a));
1950  return pfirst<Packet4f>(_mm_min_ss(tmp, _mm_shuffle_ps(tmp, tmp, 1)));
1951 }
1952 template <>
1953 EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) {
1954  return pfirst<Packet2d>(_mm_min_sd(a, _mm_unpackhi_pd(a, a)));
1955 }
1956 template <>
1957 EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a) {
1958 #ifdef EIGEN_VECTORIZE_SSE4_1
1959  Packet4i tmp = _mm_min_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 3, 2)));
1960  return pfirst<Packet4i>(_mm_min_epi32(tmp, _mm_shuffle_epi32(tmp, 1)));
1961 #else
1962  // after some experiments, it seems this is the fastest way to implement it
1963  // for GCC (e.g., it does not like using std::min after the pstore!)
1964  EIGEN_ALIGN16 int aux[4];
1965  pstore(aux, a);
1966  int aux0 = aux[0] < aux[1] ? aux[0] : aux[1];
1967  int aux2 = aux[2] < aux[3] ? aux[2] : aux[3];
1968  return aux0 < aux2 ? aux0 : aux2;
1969 #endif // EIGEN_VECTORIZE_SSE4_1
1970 }
1971 template <>
1972 EIGEN_STRONG_INLINE uint32_t predux_min<Packet4ui>(const Packet4ui& a) {
1973 #ifdef EIGEN_VECTORIZE_SSE4_1
1974  Packet4ui tmp = _mm_min_epu32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 3, 2)));
1975  return pfirst<Packet4ui>(_mm_min_epu32(tmp, _mm_shuffle_epi32(tmp, 1)));
1976 #else
1977  // after some experiments, it seems this is the fastest way to implement it
1978  // for GCC (e.g., it does not like using std::min after the pstore!)
1979  EIGEN_ALIGN16 uint32_t aux[4];
1980  pstore(aux, a);
1981  uint32_t aux0 = aux[0] < aux[1] ? aux[0] : aux[1];
1982  uint32_t aux2 = aux[2] < aux[3] ? aux[2] : aux[3];
1983  return aux0 < aux2 ? aux0 : aux2;
1984 #endif // EIGEN_VECTORIZE_SSE4_1
1985 }
1986 
1987 // max
1988 template <>
1989 EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) {
1990  Packet4f tmp = _mm_max_ps(a, _mm_movehl_ps(a, a));
1991  return pfirst<Packet4f>(_mm_max_ss(tmp, _mm_shuffle_ps(tmp, tmp, 1)));
1992 }
1993 template <>
1994 EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) {
1995  return pfirst<Packet2d>(_mm_max_sd(a, _mm_unpackhi_pd(a, a)));
1996 }
1997 template <>
1998 EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a) {
1999 #ifdef EIGEN_VECTORIZE_SSE4_1
2000  Packet4i tmp = _mm_max_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 3, 2)));
2001  return pfirst<Packet4i>(_mm_max_epi32(tmp, _mm_shuffle_epi32(tmp, 1)));
2002 #else
2003  // after some experiments, it seems this is the fastest way to implement it
2004  // for GCC (e.g., it does not like using std::max after the pstore!)
2005  EIGEN_ALIGN16 int aux[4];
2006  pstore(aux, a);
2007  int aux0 = aux[0] > aux[1] ? aux[0] : aux[1];
2008  int aux2 = aux[2] > aux[3] ? aux[2] : aux[3];
2009  return aux0 > aux2 ? aux0 : aux2;
2010 #endif // EIGEN_VECTORIZE_SSE4_1
2011 }
2012 template <>
2013 EIGEN_STRONG_INLINE uint32_t predux_max<Packet4ui>(const Packet4ui& a) {
2014 #ifdef EIGEN_VECTORIZE_SSE4_1
2015  Packet4ui tmp = _mm_max_epu32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 3, 2)));
2016  return pfirst<Packet4ui>(_mm_max_epu32(tmp, _mm_shuffle_epi32(tmp, 1)));
2017 #else
2018  // after some experiments, it seems this is the fastest way to implement it
2019  // for GCC (e.g., it does not like using std::max after the pstore!)
2020  EIGEN_ALIGN16 uint32_t aux[4];
2021  pstore(aux, a);
2022  uint32_t aux0 = aux[0] > aux[1] ? aux[0] : aux[1];
2023  uint32_t aux2 = aux[2] > aux[3] ? aux[2] : aux[3];
2024  return aux0 > aux2 ? aux0 : aux2;
2025 #endif // EIGEN_VECTORIZE_SSE4_1
2026 }
2027 
2028 // not needed yet
2029 // template<> EIGEN_STRONG_INLINE bool predux_all(const Packet4f& x)
2030 // {
2031 // return _mm_movemask_ps(x) == 0xF;
2032 // }
2033 
2034 template <>
2035 EIGEN_STRONG_INLINE bool predux_any(const Packet2d& x) {
2036  return _mm_movemask_pd(x) != 0x0;
2037 }
2038 
2039 template <>
2040 EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x) {
2041  return _mm_movemask_ps(x) != 0x0;
2042 }
2043 
2044 template <>
2045 EIGEN_STRONG_INLINE bool predux_any(const Packet2l& x) {
2046  return _mm_movemask_pd(_mm_castsi128_pd(x)) != 0x0;
2047 }
2048 
2049 template <>
2050 EIGEN_STRONG_INLINE bool predux_any(const Packet4i& x) {
2051  return _mm_movemask_ps(_mm_castsi128_ps(x)) != 0x0;
2052 }
2053 template <>
2054 EIGEN_STRONG_INLINE bool predux_any(const Packet4ui& x) {
2055  return _mm_movemask_ps(_mm_castsi128_ps(x)) != 0x0;
2056 }
2057 
2058 EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
2059  _MM_TRANSPOSE4_PS(kernel.packet[0], kernel.packet[1], kernel.packet[2], kernel.packet[3]);
2060 }
2061 
2062 EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2d, 2>& kernel) {
2063  __m128d tmp = _mm_unpackhi_pd(kernel.packet[0], kernel.packet[1]);
2064  kernel.packet[0] = _mm_unpacklo_pd(kernel.packet[0], kernel.packet[1]);
2065  kernel.packet[1] = tmp;
2066 }
2067 
2068 EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2l, 2>& kernel) {
2069  __m128i tmp = _mm_unpackhi_epi64(kernel.packet[0], kernel.packet[1]);
2070  kernel.packet[0] = _mm_unpacklo_epi64(kernel.packet[0], kernel.packet[1]);
2071  kernel.packet[1] = tmp;
2072 }
2073 
2074 EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4i, 4>& kernel) {
2075  __m128i T0 = _mm_unpacklo_epi32(kernel.packet[0], kernel.packet[1]);
2076  __m128i T1 = _mm_unpacklo_epi32(kernel.packet[2], kernel.packet[3]);
2077  __m128i T2 = _mm_unpackhi_epi32(kernel.packet[0], kernel.packet[1]);
2078  __m128i T3 = _mm_unpackhi_epi32(kernel.packet[2], kernel.packet[3]);
2079 
2080  kernel.packet[0] = _mm_unpacklo_epi64(T0, T1);
2081  kernel.packet[1] = _mm_unpackhi_epi64(T0, T1);
2082  kernel.packet[2] = _mm_unpacklo_epi64(T2, T3);
2083  kernel.packet[3] = _mm_unpackhi_epi64(T2, T3);
2084 }
2085 EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4ui, 4>& kernel) {
2086  ptranspose((PacketBlock<Packet4i, 4>&)kernel);
2087 }
2088 
2089 EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16b, 4>& kernel) {
2090  __m128i T0 = _mm_unpacklo_epi8(kernel.packet[0], kernel.packet[1]);
2091  __m128i T1 = _mm_unpackhi_epi8(kernel.packet[0], kernel.packet[1]);
2092  __m128i T2 = _mm_unpacklo_epi8(kernel.packet[2], kernel.packet[3]);
2093  __m128i T3 = _mm_unpackhi_epi8(kernel.packet[2], kernel.packet[3]);
2094  kernel.packet[0] = _mm_unpacklo_epi16(T0, T2);
2095  kernel.packet[1] = _mm_unpackhi_epi16(T0, T2);
2096  kernel.packet[2] = _mm_unpacklo_epi16(T1, T3);
2097  kernel.packet[3] = _mm_unpackhi_epi16(T1, T3);
2098 }
2099 
2100 EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16b, 16>& kernel) {
2101  // If we number the elements in the input thus:
2102  // kernel.packet[ 0] = {00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 0a, 0b, 0c, 0d, 0e, 0f}
2103  // kernel.packet[ 1] = {10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 1a, 1b, 1c, 1d, 1e, 1f}
2104  // ...
2105  // kernel.packet[15] = {f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, fa, fb, fc, fd, fe, ff},
2106  //
2107  // the desired output is:
2108  // kernel.packet[ 0] = {00, 10, 20, 30, 40, 50, 60, 70, 80, 90, a0, b0, c0, d0, e0, f0}
2109  // kernel.packet[ 1] = {01, 11, 21, 31, 41, 51, 61, 71, 81, 91, a1, b1, c1, d1, e1, f1}
2110  // ...
2111  // kernel.packet[15] = {0f, 1f, 2f, 3f, 4f, 5f, 6f, 7f, 8f, 9f, af, bf, cf, df, ef, ff},
2112  __m128i t0 =
2113  _mm_unpacklo_epi8(kernel.packet[0], kernel.packet[1]); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
2114  __m128i t1 =
2115  _mm_unpackhi_epi8(kernel.packet[0], kernel.packet[1]); // 08 18 09 19 0a 1a 0b 1b 0c 1c 0d 1d 0e 1e 0f 1f
2116  __m128i t2 =
2117  _mm_unpacklo_epi8(kernel.packet[2], kernel.packet[3]); // 20 30 21 31 22 32 ... 27 37
2118  __m128i t3 =
2119  _mm_unpackhi_epi8(kernel.packet[2], kernel.packet[3]); // 28 38 29 39 2a 3a ... 2f 3f
2120  __m128i t4 =
2121  _mm_unpacklo_epi8(kernel.packet[4], kernel.packet[5]); // 40 50 41 51 42 52 ... 47 57
2122  __m128i t5 = _mm_unpackhi_epi8(kernel.packet[4], kernel.packet[5]); // 48 58 49 59 4a 5a ... 4f 5f
2123  __m128i t6 = _mm_unpacklo_epi8(kernel.packet[6], kernel.packet[7]);
2124  __m128i t7 = _mm_unpackhi_epi8(kernel.packet[6], kernel.packet[7]);
2125  __m128i t8 = _mm_unpacklo_epi8(kernel.packet[8], kernel.packet[9]);
2126  __m128i t9 = _mm_unpackhi_epi8(kernel.packet[8], kernel.packet[9]);
2127  __m128i ta = _mm_unpacklo_epi8(kernel.packet[10], kernel.packet[11]);
2128  __m128i tb = _mm_unpackhi_epi8(kernel.packet[10], kernel.packet[11]);
2129  __m128i tc = _mm_unpacklo_epi8(kernel.packet[12], kernel.packet[13]);
2130  __m128i td = _mm_unpackhi_epi8(kernel.packet[12], kernel.packet[13]);
2131  __m128i te = _mm_unpacklo_epi8(kernel.packet[14], kernel.packet[15]);
2132  __m128i tf = _mm_unpackhi_epi8(kernel.packet[14], kernel.packet[15]);
2133 
2134  __m128i s0 = _mm_unpacklo_epi16(t0, t2); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
2135  __m128i s1 = _mm_unpackhi_epi16(t0, t2); // 04 14 24 34
2136  __m128i s2 = _mm_unpacklo_epi16(t1, t3); // 08 18 28 38 ...
2137  __m128i s3 = _mm_unpackhi_epi16(t1, t3); // 0c 1c 2c 3c ...
2138  __m128i s4 = _mm_unpacklo_epi16(t4, t6); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
2139  __m128i s5 = _mm_unpackhi_epi16(t4, t6); // 44 54 64 74 ...
2140  __m128i s6 = _mm_unpacklo_epi16(t5, t7);
2141  __m128i s7 = _mm_unpackhi_epi16(t5, t7);
2142  __m128i s8 = _mm_unpacklo_epi16(t8, ta);
2143  __m128i s9 = _mm_unpackhi_epi16(t8, ta);
2144  __m128i sa = _mm_unpacklo_epi16(t9, tb);
2145  __m128i sb = _mm_unpackhi_epi16(t9, tb);
2146  __m128i sc = _mm_unpacklo_epi16(tc, te);
2147  __m128i sd = _mm_unpackhi_epi16(tc, te);
2148  __m128i se = _mm_unpacklo_epi16(td, tf);
2149  __m128i sf = _mm_unpackhi_epi16(td, tf);
2150 
2151  __m128i u0 = _mm_unpacklo_epi32(s0, s4); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
2152  __m128i u1 = _mm_unpackhi_epi32(s0, s4); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
2153  __m128i u2 = _mm_unpacklo_epi32(s1, s5);
2154  __m128i u3 = _mm_unpackhi_epi32(s1, s5);
2155  __m128i u4 = _mm_unpacklo_epi32(s2, s6);
2156  __m128i u5 = _mm_unpackhi_epi32(s2, s6);
2157  __m128i u6 = _mm_unpacklo_epi32(s3, s7);
2158  __m128i u7 = _mm_unpackhi_epi32(s3, s7);
2159  __m128i u8 = _mm_unpacklo_epi32(s8, sc);
2160  __m128i u9 = _mm_unpackhi_epi32(s8, sc);
2161  __m128i ua = _mm_unpacklo_epi32(s9, sd);
2162  __m128i ub = _mm_unpackhi_epi32(s9, sd);
2163  __m128i uc = _mm_unpacklo_epi32(sa, se);
2164  __m128i ud = _mm_unpackhi_epi32(sa, se);
2165  __m128i ue = _mm_unpacklo_epi32(sb, sf);
2166  __m128i uf = _mm_unpackhi_epi32(sb, sf);
2167 
2168  kernel.packet[0] = _mm_unpacklo_epi64(u0, u8);
2169  kernel.packet[1] = _mm_unpackhi_epi64(u0, u8);
2170  kernel.packet[2] = _mm_unpacklo_epi64(u1, u9);
2171  kernel.packet[3] = _mm_unpackhi_epi64(u1, u9);
2172  kernel.packet[4] = _mm_unpacklo_epi64(u2, ua);
2173  kernel.packet[5] = _mm_unpackhi_epi64(u2, ua);
2174  kernel.packet[6] = _mm_unpacklo_epi64(u3, ub);
2175  kernel.packet[7] = _mm_unpackhi_epi64(u3, ub);
2176  kernel.packet[8] = _mm_unpacklo_epi64(u4, uc);
2177  kernel.packet[9] = _mm_unpackhi_epi64(u4, uc);
2178  kernel.packet[10] = _mm_unpacklo_epi64(u5, ud);
2179  kernel.packet[11] = _mm_unpackhi_epi64(u5, ud);
2180  kernel.packet[12] = _mm_unpacklo_epi64(u6, ue);
2181  kernel.packet[13] = _mm_unpackhi_epi64(u6, ue);
2182  kernel.packet[14] = _mm_unpacklo_epi64(u7, uf);
2183  kernel.packet[15] = _mm_unpackhi_epi64(u7, uf);
2184 }
2185 
2186 EIGEN_STRONG_INLINE __m128i sse_blend_mask(const Selector<2>& ifPacket) {
2187  return _mm_set_epi64x(0 - ifPacket.select[1], 0 - ifPacket.select[0]);
2188 }
2189 
2190 EIGEN_STRONG_INLINE __m128i sse_blend_mask(const Selector<4>& ifPacket) {
2191  return _mm_set_epi32(0 - ifPacket.select[3], 0 - ifPacket.select[2], 0 - ifPacket.select[1], 0 - ifPacket.select[0]);
2192 }
2193 
2194 template <>
2195 EIGEN_STRONG_INLINE Packet2l pblend(const Selector<2>& ifPacket, const Packet2l& thenPacket,
2196  const Packet2l& elsePacket) {
2197  const __m128i true_mask = sse_blend_mask(ifPacket);
2198  return pselect<Packet2l>(true_mask, thenPacket, elsePacket);
2199 }
2200 template <>
2201 EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket,
2202  const Packet4i& elsePacket) {
2203  const __m128i true_mask = sse_blend_mask(ifPacket);
2204  return pselect<Packet4i>(true_mask, thenPacket, elsePacket);
2205 }
2206 template <>
2207 EIGEN_STRONG_INLINE Packet4ui pblend(const Selector<4>& ifPacket, const Packet4ui& thenPacket,
2208  const Packet4ui& elsePacket) {
2209  return (Packet4ui)pblend(ifPacket, (Packet4i)thenPacket, (Packet4i)elsePacket);
2210 }
2211 template <>
2212 EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket,
2213  const Packet4f& elsePacket) {
2214  const __m128i true_mask = sse_blend_mask(ifPacket);
2215  return pselect<Packet4f>(_mm_castsi128_ps(true_mask), thenPacket, elsePacket);
2216 }
2217 template <>
2218 EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket,
2219  const Packet2d& elsePacket) {
2220  const __m128i true_mask = sse_blend_mask(ifPacket);
2221  return pselect<Packet2d>(_mm_castsi128_pd(true_mask), thenPacket, elsePacket);
2222 }
2223 
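// Illustrative sketch: sse_blend_mask widens each bool of the Selector into
// an all-ones or all-zeros lane (0 - true == -1), and pselect then picks
// thenPacket where the mask is set and elsePacket elsewhere.
//
//   Selector<4> s;
//   s.select[0] = true;  s.select[1] = false;
//   s.select[2] = true;  s.select[3] = false;
//   Packet4f t = pset1<Packet4f>(1.0f);
//   Packet4f e = pset1<Packet4f>(2.0f);
//   Packet4f r = pblend(s, t, e);  // r == {1, 2, 1, 2}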
2224 // Scalar path for pmadd with FMA to ensure consistency with vectorized path.
2225 #ifdef EIGEN_VECTORIZE_FMA
2226 template <>
2227 EIGEN_STRONG_INLINE float pmadd(const float& a, const float& b, const float& c) {
2228  return ::fmaf(a, b, c);
2229 }
2230 template <>
2231 EIGEN_STRONG_INLINE double pmadd(const double& a, const double& b, const double& c) {
2232  return ::fma(a, b, c);
2233 }
2234 template <>
2235 EIGEN_STRONG_INLINE float pmsub(const float& a, const float& b, const float& c) {
2236  return ::fmaf(a, b, -c);
2237 }
2238 template <>
2239 EIGEN_STRONG_INLINE double pmsub(const double& a, const double& b, const double& c) {
2240  return ::fma(a, b, -c);
2241 }
2242 template <>
2243 EIGEN_STRONG_INLINE float pnmadd(const float& a, const float& b, const float& c) {
2244  return ::fmaf(-a, b, c);
2245 }
2246 template <>
2247 EIGEN_STRONG_INLINE double pnmadd(const double& a, const double& b, const double& c) {
2248  return ::fma(-a, b, c);
2249 }
2250 template <>
2251 EIGEN_STRONG_INLINE float pnmsub(const float& a, const float& b, const float& c) {
2252  return ::fmaf(-a, b, -c);
2253 }
2254 template <>
2255 EIGEN_STRONG_INLINE double pnmsub(const double& a, const double& b, const double& c) {
2256  return ::fma(-a, b, -c);
2257 }
2258 #endif
2259 
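// Why the scalar overloads above exist: with FMA enabled, the vector path
// contracts a*b + c into a single rounding step, so the scalar path must use
// fma/fmaf as well or scalar and vectorized results could disagree.
// Illustrative sketch:
//
//   float a = 1.0f + 0x1.0p-12f;
//   float c = -1.0f;
//   float fused   = pmadd(a, a, c);  // fmaf: the product is kept exact
//   float unfused = a * a + c;       // typically rounds the product first,
//                                    // losing the 2^-24 term that fmaf keeps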
2260 #ifdef EIGEN_VECTORIZE_SSE4_1
2261 // Helpers for half->float and float->half conversions.
2262 // Currently only used by the AVX code.
2263 EIGEN_STRONG_INLINE __m128i half2floatsse(__m128i h) {
2264  __m128i input = _mm_cvtepu16_epi32(h);
2265 
2266  // Direct vectorization of half_to_float, C parts in the comments.
2267  __m128i shifted_exp = _mm_set1_epi32(0x7c00 << 13);
2268  // o.u = (h.x & 0x7fff) << 13; // exponent/mantissa bits
2269  __m128i ou = _mm_slli_epi32(_mm_and_si128(input, _mm_set1_epi32(0x7fff)), 13);
2270  // exp = shifted_exp & o.u; // just the exponent
2271  __m128i exp = _mm_and_si128(ou, shifted_exp);
2272  // o.u += (127 - 15) << 23;
2273  ou = _mm_add_epi32(ou, _mm_set1_epi32((127 - 15) << 23));
2274 
2275  // Inf/NaN?
2276  __m128i naninf_mask = _mm_cmpeq_epi32(exp, shifted_exp);
2277  // Inf/NaN adjust
2278  __m128i naninf_adj = _mm_and_si128(_mm_set1_epi32((128 - 16) << 23), naninf_mask);
2279  // extra exp adjust for Inf/NaN
2280  ou = _mm_add_epi32(ou, naninf_adj);
2281 
2282  // Zero/Denormal?
2283  __m128i zeroden_mask = _mm_cmpeq_epi32(exp, _mm_setzero_si128());
2284  __m128i zeroden_adj = _mm_and_si128(zeroden_mask, _mm_set1_epi32(1 << 23));
2285  // o.u += 1 << 23;
2286  ou = _mm_add_epi32(ou, zeroden_adj);
2287  // magic.u = 113 << 23
2288  __m128i magic = _mm_and_si128(zeroden_mask, _mm_set1_epi32(113 << 23));
2289  // o.f -= magic.f
2290  ou = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(ou), _mm_castsi128_ps(magic)));
2291 
2292  __m128i sign = _mm_slli_epi32(_mm_and_si128(input, _mm_set1_epi32(0x8000)), 16);
2293  // o.u |= (h.x & 0x8000) << 16; // sign bit
2294  ou = _mm_or_si128(ou, sign);
2295  // return o.f;
2296  // We actually return the uint version, so that
2297  // _mm256_insertf128_si256 works.
2298  return ou;
2299 }
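// Scalar form of the bit manipulation above, for reference (mirrors the
// inline comments; h is the raw fp16 pattern, u becomes the fp32 pattern):
//
//   uint32_t u = (h & 0x7fff) << 13;  // move exponent/mantissa into place
//   u += (127 - 15) << 23;            // rebias: half bias 15 -> float bias 127
//   // Inf/NaN (half exponent all ones): add another (128 - 16) << 23 so the
//   // float exponent field saturates to all ones as well.
//   // Zero/denormal (half exponent zero): add 1 << 23, then subtract the
//   // "magic" float 113 << 23 (i.e. 2^-14) to renormalize the value.
//   u |= (h & 0x8000) << 16;          // copy the sign bit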
2300 
2301 EIGEN_STRONG_INLINE __m128i float2half(__m128 f) {
2302  // unsigned int sign_mask = 0x80000000u;
2303  __m128i sign = _mm_set1_epi32(0x80000000u);
2304  // unsigned int sign = f.u & sign_mask;
2305  sign = _mm_and_si128(sign, _mm_castps_si128(f));
2306  // f.u ^= sign;
2307  f = _mm_xor_ps(f, _mm_castsi128_ps(sign));
2308 
2309  __m128i fu = _mm_castps_si128(f);
2310 
2311  __m128i f16max = _mm_set1_epi32((127 + 16) << 23);
2312  __m128i f32infty = _mm_set1_epi32(255 << 23);
2313  // if (f.u >= f16max.u) // result is Inf or NaN (all exponent bits set)
2314  // there is no _mm_cmpge_epi32, so use lt and swap operands
2315  __m128i infnan_mask = _mm_cmplt_epi32(f16max, _mm_castps_si128(f));
2316  __m128i inf_mask = _mm_cmpgt_epi32(_mm_castps_si128(f), f32infty);
2317  __m128i nan_mask = _mm_andnot_si128(inf_mask, infnan_mask);
2318  __m128i inf_value = _mm_and_si128(inf_mask, _mm_set1_epi32(0x7e00));
2319  __m128i nan_value = _mm_and_si128(nan_mask, _mm_set1_epi32(0x7c00));
2320  // o.x = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf
2321  __m128i naninf_value = _mm_or_si128(inf_value, nan_value);
2322 
2323  __m128i denorm_magic = _mm_set1_epi32(((127 - 15) + (23 - 10) + 1) << 23);
2324  __m128i subnorm_mask = _mm_cmplt_epi32(_mm_castps_si128(f), _mm_set1_epi32(113 << 23));
2325  // f.f += denorm_magic.f;
2326  f = _mm_add_ps(f, _mm_castsi128_ps(denorm_magic));
2327  // f.u - denorm_magic.u
2328  __m128i o = _mm_sub_epi32(_mm_castps_si128(f), denorm_magic);
2329  o = _mm_and_si128(o, subnorm_mask);
2330  // Correct result for inf/nan/zero/subnormal, 0 otherwise
2331  o = _mm_or_si128(o, naninf_value);
2332 
2333  __m128i mask = _mm_or_si128(infnan_mask, subnorm_mask);
2334  o = _mm_and_si128(o, mask);
2335 
2336  // mant_odd = (f.u >> 13) & 1;
2337  __m128i mant_odd = _mm_and_si128(_mm_srli_epi32(fu, 13), _mm_set1_epi32(0x1));
2338  // f.u += 0xc8000fffU;
2339  fu = _mm_add_epi32(fu, _mm_set1_epi32(0xc8000fffU));
2340  // f.u += mant_odd;
2341  fu = _mm_add_epi32(fu, mant_odd);
2342  fu = _mm_andnot_si128(mask, fu);
2343  // f.u >> 13
2344  fu = _mm_srli_epi32(fu, 13);
2345  o = _mm_or_si128(fu, o);
2346 
2347  // o.x |= static_cast<numext::uint16_t>(sign >> 16);
2348  o = _mm_or_si128(o, _mm_srli_epi32(sign, 16));
2349 
2350  // 16 bit values
2351  return _mm_and_si128(o, _mm_set1_epi32(0xffff));
2352 }
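// Illustrative sketch: float2half leaves the fp16 bit pattern of each lane
// in the low 16 bits of the corresponding 32-bit lane.
//
//   __m128 f = _mm_setr_ps(0.5f, 1.0f, 2.0f, 65504.0f);
//   __m128i h = float2half(f);  // lanes: {0x3800, 0x3c00, 0x4000, 0x7bff}
//
// Rounding is to nearest-even: 0xc8000fff folds the exponent rebias with a
// 0xfff rounding bias, and adding mant_odd on top breaks ties toward even
// before the final >> 13.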
2353 #endif
2354 
2355 // Packet math for Eigen::half
2356 // Disable the following code since it's broken on too many platforms / compilers.
2357 // #elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC)
2358 #if 0
2359 
2360 typedef struct {
2361  __m64 x;
2362 } Packet4h;
2363 
2364 
2365 template<> struct is_arithmetic<Packet4h> { enum { value = true }; };
2366 
2367 template <>
2368 struct packet_traits<Eigen::half> : default_packet_traits {
2369  typedef Packet4h type;
2370  // There is no half-size packet for Packet4h.
2371  typedef Packet4h half;
2372  enum {
2373  Vectorizable = 1,
2374  AlignedOnScalar = 1,
2375  size = 4,
2376  HasAdd = 1,
2377  HasSub = 1,
2378  HasMul = 1,
2379  HasDiv = 1,
2380  HasNegate = 0,
2381  HasAbs = 0,
2382  HasAbs2 = 0,
2383  HasMin = 0,
2384  HasMax = 0,
2385  HasConj = 0,
2386  HasSetLinear = 0,
2387  };
2388 };
2389 
2390 
2391 template<> struct unpacket_traits<Packet4h> { typedef Eigen::half type; enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet4h half; };
2392 
2393 template<> EIGEN_STRONG_INLINE Packet4h pset1<Packet4h>(const Eigen::half& from) {
2394  Packet4h result;
2395  result.x = _mm_set1_pi16(from.x);
2396  return result;
2397 }
2398 
2399 template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet4h>(const Packet4h& from) {
2400  return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm_cvtsi64_si32(from.x)));
2401 }
2402 
2403 template<> EIGEN_STRONG_INLINE Packet4h pconj(const Packet4h& a) { return a; }
2404 
2405 template<> EIGEN_STRONG_INLINE Packet4h padd<Packet4h>(const Packet4h& a, const Packet4h& b) {
2406  __int64_t a64 = _mm_cvtm64_si64(a.x);
2407  __int64_t b64 = _mm_cvtm64_si64(b.x);
2408 
2409  Eigen::half h[4];
2410 
2411  Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
2412  Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
2413  h[0] = ha + hb;
2414  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
2415  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
2416  h[1] = ha + hb;
2417  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
2418  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
2419  h[2] = ha + hb;
2420  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
2421  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
2422  h[3] = ha + hb;
2423  Packet4h result;
2424  result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
2425  return result;
2426 }
2427 
2428 template<> EIGEN_STRONG_INLINE Packet4h psub<Packet4h>(const Packet4h& a, const Packet4h& b) {
2429  __int64_t a64 = _mm_cvtm64_si64(a.x);
2430  __int64_t b64 = _mm_cvtm64_si64(b.x);
2431 
2432  Eigen::half h[4];
2433 
2434  Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
2435  Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
2436  h[0] = ha - hb;
2437  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
2438  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
2439  h[1] = ha - hb;
2440  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
2441  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
2442  h[2] = ha - hb;
2443  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
2444  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
2445  h[3] = ha - hb;
2446  Packet4h result;
2447  result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
2448  return result;
2449 }
2450 
2451 template<> EIGEN_STRONG_INLINE Packet4h pmul<Packet4h>(const Packet4h& a, const Packet4h& b) {
2452  __int64_t a64 = _mm_cvtm64_si64(a.x);
2453  __int64_t b64 = _mm_cvtm64_si64(b.x);
2454 
2455  Eigen::half h[4];
2456 
2457  Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
2458  Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
2459  h[0] = ha * hb;
2460  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
2461  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
2462  h[1] = ha * hb;
2463  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
2464  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
2465  h[2] = ha * hb;
2466  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
2467  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
2468  h[3] = ha * hb;
2469  Packet4h result;
2470  result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
2471  return result;
2472 }
2473 
2474 template<> EIGEN_STRONG_INLINE Packet4h pdiv<Packet4h>(const Packet4h& a, const Packet4h& b) {
2475  __int64_t a64 = _mm_cvtm64_si64(a.x);
2476  __int64_t b64 = _mm_cvtm64_si64(b.x);
2477 
2478  Eigen::half h[4];
2479 
2480  Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
2481  Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
2482  h[0] = ha / hb;
2483  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
2484  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
2485  h[1] = ha / hb;
2486  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
2487  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
2488  h[2] = ha / hb;
2489  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
2490  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
2491  h[3] = ha / hb;
2492  Packet4h result;
2493  result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
2494  return result;
2495 }
2496 
2497 template<> EIGEN_STRONG_INLINE Packet4h pload<Packet4h>(const Eigen::half* from) {
2498  Packet4h result;
2499  result.x = _mm_cvtsi64_m64(*reinterpret_cast<const __int64_t*>(from));
2500  return result;
2501 }
2502 
2503 template<> EIGEN_STRONG_INLINE Packet4h ploadu<Packet4h>(const Eigen::half* from) {
2504  Packet4h result;
2505  result.x = _mm_cvtsi64_m64(*reinterpret_cast<const __int64_t*>(from));
2506  return result;
2507 }
2508 
2509 template<> EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet4h& from) {
2510  __int64_t r = _mm_cvtm64_si64(from.x);
2511  *(reinterpret_cast<__int64_t*>(to)) = r;
2512 }
2513 
2514 template<> EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet4h& from) {
2515  __int64_t r = _mm_cvtm64_si64(from.x);
2516  *(reinterpret_cast<__int64_t*>(to)) = r;
2517 }
2518 
2519 template<> EIGEN_STRONG_INLINE Packet4h
2520 ploadquad<Packet4h>(const Eigen::half* from) {
2521  return pset1<Packet4h>(*from);
2522 }
2523 
2524 template<> EIGEN_STRONG_INLINE Packet4h pgather<Eigen::half, Packet4h>(const Eigen::half* from, Index stride)
2525 {
2526  Packet4h result;
2527  result.x = _mm_set_pi16(from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x);
2528  return result;
2529 }
2530 
2531 template<> EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet4h>(Eigen::half* to, const Packet4h& from, Index stride)
2532 {
2533  __int64_t a = _mm_cvtm64_si64(from.x);
2534  to[stride*0].x = static_cast<unsigned short>(a);
2535  to[stride*1].x = static_cast<unsigned short>(a >> 16);
2536  to[stride*2].x = static_cast<unsigned short>(a >> 32);
2537  to[stride*3].x = static_cast<unsigned short>(a >> 48);
2538 }
2539 
2540 EIGEN_STRONG_INLINE void
2541 ptranspose(PacketBlock<Packet4h,4>& kernel) {
2542  __m64 T0 = _mm_unpacklo_pi16(kernel.packet[0].x, kernel.packet[1].x);
2543  __m64 T1 = _mm_unpacklo_pi16(kernel.packet[2].x, kernel.packet[3].x);
2544  __m64 T2 = _mm_unpackhi_pi16(kernel.packet[0].x, kernel.packet[1].x);
2545  __m64 T3 = _mm_unpackhi_pi16(kernel.packet[2].x, kernel.packet[3].x);
2546 
2547  kernel.packet[0].x = _mm_unpacklo_pi32(T0, T1);
2548  kernel.packet[1].x = _mm_unpackhi_pi32(T0, T1);
2549  kernel.packet[2].x = _mm_unpacklo_pi32(T2, T3);
2550  kernel.packet[3].x = _mm_unpackhi_pi32(T2, T3);
2551 }
2552 
2553 #endif
2554 
2555 } // end namespace internal
2556 
2557 } // end namespace Eigen
2558 
2559 #if EIGEN_COMP_PGI && EIGEN_COMP_PGI < 1900
2560 // PGI++ does not define the following intrinsics in C++ mode.
2561 static inline __m128 _mm_castpd_ps(__m128d x) { return reinterpret_cast<__m128&>(x); }
2562 static inline __m128i _mm_castpd_si128(__m128d x) { return reinterpret_cast<__m128i&>(x); }
2563 static inline __m128d _mm_castps_pd(__m128 x) { return reinterpret_cast<__m128d&>(x); }
2564 static inline __m128i _mm_castps_si128(__m128 x) { return reinterpret_cast<__m128i&>(x); }
2565 static inline __m128 _mm_castsi128_ps(__m128i x) { return reinterpret_cast<__m128&>(x); }
2566 static inline __m128d _mm_castsi128_pd(__m128i x) { return reinterpret_cast<__m128d&>(x); }
2567 #endif
2568 
2569 #endif // EIGEN_PACKET_MATH_SSE_H