AVX/PacketMath.h
Go to the documentation of this file.
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2014 Benoit Steiner (benoit.steiner.goog@gmail.com)
5 //
6 // This Source Code Form is subject to the terms of the Mozilla
7 // Public License v. 2.0. If a copy of the MPL was not distributed
8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 
10 #ifndef EIGEN_PACKET_MATH_AVX_H
11 #define EIGEN_PACKET_MATH_AVX_H
12 
13 // IWYU pragma: private
14 #include "../../InternalHeaderCheck.h"
15 
16 namespace Eigen {
17 
18 namespace internal {
19 
20 #ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
21 #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
22 #endif
23 
24 #if !defined(EIGEN_VECTORIZE_AVX512) && !defined(EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS)
25 #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16
26 #endif
27 
28 #ifdef EIGEN_VECTORIZE_FMA
29 #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
30 #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
31 #endif
32 #endif
33 
34 typedef __m256 Packet8f;
36 typedef __m256d Packet4d;
37 #ifndef EIGEN_VECTORIZE_AVX512FP16
39 #endif
42 
43 #ifdef EIGEN_VECTORIZE_AVX2
44 // Start from 3 to be compatible with AVX512
45 typedef eigen_packet_wrapper<__m256i, 3> Packet4l;
46 typedef eigen_packet_wrapper<__m256i, 5> Packet4ul;
47 #endif
48 
// Mark the raw AVX register types as arithmetic so Eigen's internal type
// traits treat them as plain, copyable value types usable inside packets.
template <>
struct is_arithmetic<__m256> {
  enum { value = true };
};
template <>
struct is_arithmetic<__m256i> {
  enum { value = true };
};
template <>
struct is_arithmetic<__m256d> {
  enum { value = true };
};
61 template <>
63  enum { value = true };
64 };
65 // Note that `Packet8ui` uses the underlying type `__m256i`, which is
66 // interpreted as a vector of _signed_ `int32`s, which breaks some arithmetic
67 // operations used in `GenericPacketMath.h`.
68 template <>
70  enum { value = false };
71 };
72 #ifndef EIGEN_VECTORIZE_AVX512FP16
73 template <>
75  enum { value = true };
76 };
77 #endif
78 template <>
80  enum { value = true };
81 };
82 #ifdef EIGEN_VECTORIZE_AVX2
// Packet4l wraps __m256i holding 4 signed int64 lanes; safe to treat as arithmetic.
template <>
struct is_arithmetic<Packet4l> {
  enum { value = true };
};
// Note that `Packet4ul` uses the underlying type `__m256i`, which is
// interpreted as a vector of _signed_ `int32`s, which breaks some arithmetic
// operations used in `GenericPacketMath.h`.
template <>
struct is_arithmetic<Packet4ul> {
  enum { value = false };
};
94 #endif
95 
96 // Use the packet_traits defined in AVX512/PacketMath.h instead if we're going
97 // to leverage AVX512 instructions.
98 #ifndef EIGEN_VECTORIZE_AVX512
99 template <>
100 struct packet_traits<float> : default_packet_traits {
101  typedef Packet8f type;
102  typedef Packet4f half;
103  enum {
104  Vectorizable = 1,
105  AlignedOnScalar = 1,
106  size = 8,
107 
108  HasCmp = 1,
109  HasDiv = 1,
113  HasACos = 1,
114  HasASin = 1,
115  HasATan = 1,
116  HasATanh = 1,
117  HasLog = 1,
118  HasLog1p = 1,
119  HasExpm1 = 1,
120  HasExp = 1,
121  HasNdtri = 1,
123  HasSqrt = 1,
124  HasRsqrt = 1,
128  HasBlend = 1
129  };
130 };
131 template <>
133  typedef Packet4d type;
134  typedef Packet2d half;
135  enum {
138  size = 4,
139 
140  HasCmp = 1,
141  HasDiv = 1,
142 #ifdef EIGEN_VECTORIZE_AVX2
145 #endif
147  HasLog = 1,
148  HasErf = 1,
149  HasErfc = 1,
150  HasExp = 1,
151  HasSqrt = 1,
152  HasRsqrt = 1,
153  HasATan = 1,
154  HasATanh = 1,
155  HasBlend = 1
156  };
157 };
158 
159 template <>
161  typedef Packet8h type;
162  // There is no half-size packet for Packet8h.
163  typedef Packet8h half;
164  enum {
167  size = 8,
168 
169  HasCmp = 1,
170  HasAdd = 1,
171  HasSub = 1,
172  HasMul = 1,
173  HasDiv = 1,
177  HasAbs = 1,
178  HasAbs2 = 0,
179  HasMin = 1,
180  HasMax = 1,
181  HasConj = 1,
183  HasLog = 1,
184  HasLog1p = 1,
185  HasExpm1 = 1,
186  HasExp = 1,
187  HasSqrt = 1,
188  HasRsqrt = 1,
191  HasBlend = 0,
193  HasNdtri = 1
194  };
195 };
196 
// Packet support traits for bfloat16 on AVX: one Packet8bf holds 8 values.
// The flags below advertise which packet ops have vectorized implementations.
// NOTE(review): this copy of the file appears to have lost a few flag lines
// (gaps in the original numbering) — verify the flag list against upstream.
template <>
struct packet_traits<bfloat16> : default_packet_traits {
  typedef Packet8bf type;
  // There is no half-size packet for current Packet8bf.
  // TODO: support as SSE path.
  typedef Packet8bf half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 8,

    HasCmp = 1,
    HasAdd = 1,
    HasSub = 1,
    HasMul = 1,
    HasDiv = 1,
    HasNegate = 1,
    HasAbs = 1,
    HasAbs2 = 0,
    HasMin = 1,
    HasMax = 1,
    HasConj = 1,
    HasLog = 1,
    HasLog1p = 1,
    HasExpm1 = 1,
    HasExp = 1,
    HasSqrt = 1,
    HasRsqrt = 1,
    HasBlend = 0,
    HasNdtri = 1
  };
};
235 
// Packet support traits for 32-bit int on AVX: one Packet8i holds 8 values.
template <>
struct packet_traits<int> : default_packet_traits {
  typedef Packet8i type;
  typedef Packet4i half;  // SSE packet used when a half-width packet is requested.
  enum { Vectorizable = 1, AlignedOnScalar = 1, HasCmp = 1, HasDiv = 1, size = 8 };
};
242 template <>
244  typedef Packet8ui type;
245  typedef Packet4ui half;
246  enum {
249  size = 8,
250 
251  HasDiv = 0,
253  HasSqrt = 0,
254 
255  HasCmp = 1,
256  HasMin = 1,
257  HasMax = 1,
258  HasShift = 1
259  };
260 };
261 
262 #ifdef EIGEN_VECTORIZE_AVX2
// Packet support traits for int64_t (AVX2 only): one Packet4l holds 4 values.
template <>
struct packet_traits<int64_t> : default_packet_traits {
  typedef Packet4l type;
  typedef Packet2l half;  // SSE packet used when a half-width packet is requested.
  enum { Vectorizable = 1, AlignedOnScalar = 1, HasCmp = 1, size = 4 };
};
// Packet support traits for uint64_t (AVX2 only): one Packet4ul holds 4 values.
// Several ops are disabled because the underlying __m256i is treated as signed.
template <>
struct packet_traits<uint64_t> : default_packet_traits {
  typedef Packet4ul type;
  // There is no half-size packet for current Packet4ul.
  // TODO: support as SSE path.
  typedef Packet4ul half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 4,

    // HasMin = 0,
    // HasMax = 0,
    HasDiv = 0,
    HasBlend = 0,
    HasTranspose = 0,
    HasNegate = 0,
    HasSqrt = 0,
    HasCmp = 1,
    HasShift = 1
  };
};
291 #endif
292 
293 #endif
294 
// Approximate cost of a vectorized scalar division on AVX, consumed by
// Eigen's expression cost model when deciding how to evaluate expressions.
template <>
struct scalar_div_cost<float, true> {
  enum { value = 14 };
};
template <>
struct scalar_div_cost<double, true> {
  enum { value = 16 };
};
303 
304 template <>
306  typedef float type;
307  typedef Packet4f half;
309  typedef uint8_t mask_t;
310  enum {
311  size = 8,
313  vectorizable = true,
316 #ifdef EIGEN_VECTORIZE_AVX512
317  ,
318  masked_fpops_available = true
319 #endif
320  };
321 };
322 template <>
324  typedef double type;
325  typedef Packet2d half;
326 #ifdef EIGEN_VECTORIZE_AVX2
327  typedef Packet4l integer_packet;
328 #endif
329  enum {
330  size = 4,
332  vectorizable = true,
334  masked_store_available = false
335  };
336 };
337 template <>
339  typedef int type;
340  typedef Packet4i half;
341  enum {
342  size = 8,
344  vectorizable = true,
346  masked_store_available = false
347  };
348 };
349 template <>
351  typedef uint32_t type;
352  typedef Packet4ui half;
353  enum {
354  size = 8,
356  vectorizable = true,
358  masked_store_available = false
359  };
360 };
361 #ifdef EIGEN_VECTORIZE_AVX2
// Reverse-mapping traits: scalar type, half-width packet and capabilities
// for the 64-bit integer AVX2 packets.
// NOTE(review): an enumerator line (likely `alignment`) appears to have been
// dropped from this copy — confirm against upstream before editing.
template <>
struct unpacket_traits<Packet4l> {
  typedef int64_t type;
  typedef Packet2l half;
  enum {
    size = 4,
    vectorizable = true,
    masked_load_available = false,
    masked_store_available = false
  };
};
template <>
struct unpacket_traits<Packet4ul> {
  typedef uint64_t type;
  typedef Packet4ul half;  // no narrower unsigned-64 packet is defined.
  enum {
    size = 4,
    vectorizable = true,
    masked_load_available = false,
    masked_store_available = false
  };
};
386 #endif
// Reverse-mapping traits for the bfloat16 packet (8 lanes, no half packet).
// NOTE(review): an enumerator line (likely `alignment`) appears to have been
// dropped from this copy — confirm against upstream before editing.
template <>
struct unpacket_traits<Packet8bf> {
  typedef bfloat16 type;
  typedef Packet8bf half;
  enum {
    size = 8,
    vectorizable = true,
    masked_load_available = false,
    masked_store_available = false
  };
};
399 
400 // Helper function for bit packing snippet of low precision comparison.
401 // It packs the flags from 16x16 to 8x16.
403  return _mm_packs_epi32(_mm256_extractf128_si256(_mm256_castps_si256(rf), 0),
404  _mm256_extractf128_si256(_mm256_castps_si256(rf), 1));
405 }
406 
407 #ifdef EIGEN_VECTORIZE_AVX2
// ---- Packet4l / Packet4ul (AVX2) constructors and additive ops ----

// Broadcast one scalar to all 4 lanes.
template <>
EIGEN_STRONG_INLINE Packet4l pset1<Packet4l>(const int64_t& from) {
  return _mm256_set1_epi64x(from);
}
template <>
EIGEN_STRONG_INLINE Packet4ul pset1<Packet4ul>(const uint64_t& from) {
  return _mm256_set1_epi64x(numext::bit_cast<uint64_t>(from));
}
// All-zero packet; the argument only selects the overload.
template <>
EIGEN_STRONG_INLINE Packet4l pzero(const Packet4l& /*a*/) {
  return _mm256_setzero_si256();
}
template <>
EIGEN_STRONG_INLINE Packet4ul pzero(const Packet4ul& /*a*/) {
  return _mm256_setzero_si256();
}
// Mask selecting the even lanes (lane 0 and 2 all-ones, lane 1 and 3 zero;
// _mm256_set_epi64x lists lanes from highest to lowest).
template <>
EIGEN_STRONG_INLINE Packet4l peven_mask(const Packet4l& /*a*/) {
  return _mm256_set_epi64x(0ll, -1ll, 0ll, -1ll);
}
template <>
EIGEN_STRONG_INLINE Packet4ul peven_mask(const Packet4ul& /*a*/) {
  return _mm256_set_epi64x(0ll, -1ll, 0ll, -1ll);
}
// Broadcast the pointed-to scalar to all lanes.
template <>
EIGEN_STRONG_INLINE Packet4l pload1<Packet4l>(const int64_t* from) {
  return _mm256_set1_epi64x(*from);
}
template <>
EIGEN_STRONG_INLINE Packet4ul pload1<Packet4ul>(const uint64_t* from) {
  return _mm256_set1_epi64x(*from);
}
// Lane-wise 64-bit addition (wraps on overflow).
template <>
EIGEN_STRONG_INLINE Packet4l padd<Packet4l>(const Packet4l& a, const Packet4l& b) {
  return _mm256_add_epi64(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet4ul padd<Packet4ul>(const Packet4ul& a, const Packet4ul& b) {
  return _mm256_add_epi64(a, b);
}
// Linear sequence {a, a+1, a+2, a+3}.
template <>
EIGEN_STRONG_INLINE Packet4l plset<Packet4l>(const int64_t& a) {
  return padd(pset1<Packet4l>(a), Packet4l(_mm256_set_epi64x(3ll, 2ll, 1ll, 0ll)));
}
template <>
EIGEN_STRONG_INLINE Packet4ul plset<Packet4ul>(const uint64_t& a) {
  return padd(pset1<Packet4ul>(a), Packet4ul(_mm256_set_epi64x(3ll, 2ll, 1ll, 0ll)));
}
// Lane-wise 64-bit subtraction (wraps on overflow).
template <>
EIGEN_STRONG_INLINE Packet4l psub<Packet4l>(const Packet4l& a, const Packet4l& b) {
  return _mm256_sub_epi64(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet4ul psub<Packet4ul>(const Packet4ul& a, const Packet4ul& b) {
  return _mm256_sub_epi64(a, b);
}
// Negation as 0 - a (AVX2 has no dedicated 64-bit negate).
template <>
EIGEN_STRONG_INLINE Packet4l pnegate(const Packet4l& a) {
  return psub(pzero(a), a);
}
// Conjugation is the identity for real/integer packets.
template <>
EIGEN_STRONG_INLINE Packet4l pconj(const Packet4l& a) {
  return a;
}
// a <= b computed as NOT(a > b); AVX2 only provides a signed greater-than
// for 64-bit lanes. Results are all-ones/all-zero lane masks.
template <>
EIGEN_STRONG_INLINE Packet4l pcmp_le(const Packet4l& a, const Packet4l& b) {
  return _mm256_xor_si256(_mm256_cmpgt_epi64(a, b), _mm256_set1_epi32(-1));
}
// Unsigned comparisons: bias both operands by 2^63 so that the signed
// compare yields the unsigned ordering.
template <>
EIGEN_STRONG_INLINE Packet4ul pcmp_le(const Packet4ul& a, const Packet4ul& b) {
  return (Packet4ul)pcmp_le((Packet4l)psub(a, pset1<Packet4ul>(0x8000000000000000UL)),
                            (Packet4l)psub(b, pset1<Packet4ul>(0x8000000000000000UL)));
}
// a < b expressed as b > a.
template <>
EIGEN_STRONG_INLINE Packet4l pcmp_lt(const Packet4l& a, const Packet4l& b) {
  return _mm256_cmpgt_epi64(b, a);
}
template <>
EIGEN_STRONG_INLINE Packet4ul pcmp_lt(const Packet4ul& a, const Packet4ul& b) {
  return (Packet4ul)pcmp_lt((Packet4l)psub(a, pset1<Packet4ul>(0x8000000000000000UL)),
                            (Packet4l)psub(b, pset1<Packet4ul>(0x8000000000000000UL)));
}
// Equality: bit patterns match regardless of signedness, no bias needed.
template <>
EIGEN_STRONG_INLINE Packet4l pcmp_eq(const Packet4l& a, const Packet4l& b) {
  return _mm256_cmpeq_epi64(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet4ul pcmp_eq(const Packet4ul& a, const Packet4ul& b) {
  return _mm256_cmpeq_epi64(a, b);
}
// All-ones mask: x == x is true in every lane.
template <>
EIGEN_STRONG_INLINE Packet4l ptrue<Packet4l>(const Packet4l& a) {
  return _mm256_cmpeq_epi64(a, a);
}
template <>
EIGEN_STRONG_INLINE Packet4ul ptrue<Packet4ul>(const Packet4ul& a) {
  return _mm256_cmpeq_epi64(a, a);
}
// Bitwise ops on the 256-bit integer register.
template <>
EIGEN_STRONG_INLINE Packet4l pand<Packet4l>(const Packet4l& a, const Packet4l& b) {
  return _mm256_and_si256(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet4l por<Packet4l>(const Packet4l& a, const Packet4l& b) {
  return _mm256_or_si256(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet4l pxor<Packet4l>(const Packet4l& a, const Packet4l& b) {
  return _mm256_xor_si256(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet4ul pxor<Packet4ul>(const Packet4ul& a, const Packet4ul& b) {
  return _mm256_xor_si256(a, b);
}
// Eigen's pandnot(a, b) == a & ~b, while _mm256_andnot_si256(x, y) computes
// ~x & y — hence the swapped argument order.
template <>
EIGEN_STRONG_INLINE Packet4l pandnot<Packet4l>(const Packet4l& a, const Packet4l& b) {
  return _mm256_andnot_si256(b, a);
}
// Logical (zero-filling) shifts map directly onto AVX2 intrinsics.
template <int N>
EIGEN_STRONG_INLINE Packet4l plogical_shift_right(Packet4l a) {
  return _mm256_srli_epi64(a, N);
}
template <int N>
EIGEN_STRONG_INLINE Packet4l plogical_shift_left(Packet4l a) {
  return _mm256_slli_epi64(a, N);
}
#ifdef EIGEN_VECTORIZE_AVX512FP16
// A native 64-bit arithmetic shift instruction is available here.
template <int N>
EIGEN_STRONG_INLINE Packet4l parithmetic_shift_right(Packet4l a) {
  return _mm256_srai_epi64(a, N);
}
#else
// AVX2 has no 64-bit arithmetic shift; emulate it with 32-bit shifts and
// blends, dispatching on the compile-time shift amount N.
template <int N>
EIGEN_STRONG_INLINE std::enable_if_t<(N == 0), Packet4l> parithmetic_shift_right(Packet4l a) {
  return a;
}
// 0 < N < 32: shift the upper 32-bit words arithmetically (sign-filling) and
// the full 64-bit lanes logically, then blend: odd 32-bit slots (mask bits 0)
// take the sign-extended upper words, even slots the logically-shifted lower words.
template <int N>
EIGEN_STRONG_INLINE std::enable_if_t<(N > 0) && (N < 32), Packet4l> parithmetic_shift_right(Packet4l a) {
  __m256i hi_word = _mm256_srai_epi32(a, N);
  __m256i lo_word = _mm256_srli_epi64(a, N);
  return _mm256_blend_epi32(hi_word, lo_word, 0b01010101);
}
// 32 <= N < 63: the result's low word is the upper word shifted by N-32;
// the result's high word is pure sign extension (shift by 31).
template <int N>
EIGEN_STRONG_INLINE std::enable_if_t<(N >= 32) && (N < 63), Packet4l> parithmetic_shift_right(Packet4l a) {
  __m256i hi_word = _mm256_srai_epi32(a, 31);
  __m256i lo_word = _mm256_shuffle_epi32(_mm256_srai_epi32(a, N - 32), (shuffle_mask<1, 1, 3, 3>::mask));
  return _mm256_blend_epi32(hi_word, lo_word, 0b01010101);
}
// N == 63: result is the sign mask — all-ones for negative lanes, else zero.
template <int N>
EIGEN_STRONG_INLINE std::enable_if_t<(N == 63), Packet4l> parithmetic_shift_right(Packet4l a) {
  return _mm256_cmpgt_epi64(_mm256_setzero_si256(), a);
}
// Out-of-range N: reduce modulo 64, matching x86 shift semantics.
template <int N>
EIGEN_STRONG_INLINE std::enable_if_t<(N < 0) || (N > 63), Packet4l> parithmetic_shift_right(Packet4l a) {
  return parithmetic_shift_right<int(N & 63)>(a);
}
#endif
// Aligned load of 4 contiguous 64-bit values (pointer must be 32-byte aligned).
template <>
EIGEN_STRONG_INLINE Packet4l pload<Packet4l>(const int64_t* from) {
  EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(reinterpret_cast<const __m256i*>(from));
}
template <>
EIGEN_STRONG_INLINE Packet4ul pload<Packet4ul>(const uint64_t* from) {
  EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(reinterpret_cast<const __m256i*>(from));
}
// Unaligned load of 4 contiguous 64-bit values.
template <>
EIGEN_STRONG_INLINE Packet4l ploadu<Packet4l>(const int64_t* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from));
}
template <>
EIGEN_STRONG_INLINE Packet4ul ploadu<Packet4ul>(const uint64_t* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from));
}
// Loads 2 int64_ts from memory a returns the packet {a0, a0, a1, a1}
// (duplication done by permuting 32-bit words after a 128-bit load).
template <>
EIGEN_STRONG_INLINE Packet4l ploaddup<Packet4l>(const int64_t* from) {
  const Packet4l a = _mm256_castsi128_si256(_mm_loadu_si128(reinterpret_cast<const __m128i*>(from)));
  return _mm256_permutevar8x32_epi32(a, _mm256_setr_epi32(0, 1, 0, 1, 2, 3, 2, 3));
}
// Loads 2 uint64_ts from memory a returns the packet {a0, a0, a1, a1}
template <>
EIGEN_STRONG_INLINE Packet4ul ploaddup<Packet4ul>(const uint64_t* from) {
  const Packet4ul a = _mm256_castsi128_si256(_mm_loadu_si128(reinterpret_cast<const __m128i*>(from)));
  return _mm256_permutevar8x32_epi32(a, _mm256_setr_epi32(0, 1, 0, 1, 2, 3, 2, 3));
}
// Aligned store (destination must be 32-byte aligned).
template <>
EIGEN_STRONG_INLINE void pstore<int64_t>(int64_t* to, const Packet4l& from) {
  EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), from);
}
template <>
EIGEN_STRONG_INLINE void pstore<uint64_t>(uint64_t* to, const Packet4ul& from) {
  EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), from);
}
// Unaligned store.
template <>
EIGEN_STRONG_INLINE void pstoreu<int64_t>(int64_t* to, const Packet4l& from) {
  EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from);
}
template <>
EIGEN_STRONG_INLINE void pstoreu<uint64_t>(uint64_t* to, const Packet4ul& from) {
  EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from);
}
// Strided gather: read 4 elements spaced `stride` apart (scalar loads; note
// _mm256_set_epi64x takes arguments from highest lane to lowest).
template <>
EIGEN_DEVICE_FUNC inline Packet4l pgather<int64_t, Packet4l>(const int64_t* from, Index stride) {
  return _mm256_set_epi64x(from[3 * stride], from[2 * stride], from[1 * stride], from[0 * stride]);
}
template <>
EIGEN_DEVICE_FUNC inline Packet4ul pgather<uint64_t, Packet4ul>(const uint64_t* from, Index stride) {
  return _mm256_set_epi64x(from[3 * stride], from[2 * stride], from[1 * stride], from[0 * stride]);
}
// Strided scatter: write the 4 lanes to addresses spaced `stride` apart,
// extracting each 128-bit half and then its two 64-bit elements.
template <>
EIGEN_DEVICE_FUNC inline void pscatter<int64_t, Packet4l>(int64_t* to, const Packet4l& from, Index stride) {
  __m128i low = _mm256_extractf128_si256(from, 0);
  to[stride * 0] = _mm_extract_epi64_0(low);
  to[stride * 1] = _mm_extract_epi64_1(low);

  __m128i high = _mm256_extractf128_si256(from, 1);
  to[stride * 2] = _mm_extract_epi64_0(high);
  to[stride * 3] = _mm_extract_epi64_1(high);
}
template <>
EIGEN_DEVICE_FUNC inline void pscatter<uint64_t, Packet4ul>(uint64_t* to, const Packet4ul& from, Index stride) {
  __m128i low = _mm256_extractf128_si256(from, 0);
  to[stride * 0] = _mm_extract_epi64_0(low);
  to[stride * 1] = _mm_extract_epi64_1(low);

  __m128i high = _mm256_extractf128_si256(from, 1);
  to[stride * 2] = _mm_extract_epi64_0(high);
  to[stride * 3] = _mm_extract_epi64_1(high);
}
// Broadcast a scalar and store the whole packet (aligned destination).
template <>
EIGEN_STRONG_INLINE void pstore1<Packet4l>(int64_t* to, const int64_t& a) {
  Packet4l pa = pset1<Packet4l>(a);
  pstore(to, pa);
}
template <>
EIGEN_STRONG_INLINE void pstore1<Packet4ul>(uint64_t* to, const uint64_t& a) {
  Packet4ul pa = pset1<Packet4ul>(a);
  pstore(to, pa);
}
// Extract lane 0 as a scalar.
template <>
EIGEN_STRONG_INLINE int64_t pfirst<Packet4l>(const Packet4l& a) {
  return _mm_extract_epi64_0(_mm256_castsi256_si128(a));
}
template <>
EIGEN_STRONG_INLINE uint64_t pfirst<Packet4ul>(const Packet4ul& a) {
  return _mm_extract_epi64_0(_mm256_castsi256_si128(a));
}
655 template <>
656 EIGEN_STRONG_INLINE int64_t predux<Packet4l>(const Packet4l& a) {
657  __m128i r = _mm_add_epi64(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1));
659 }
// Horizontal sum of the 4 uint64 lanes: fold halves, then add the last two
// elements (extraction is signed, so bit_cast back to unsigned).
template <>
EIGEN_STRONG_INLINE uint64_t predux<Packet4ul>(const Packet4ul& a) {
  __m128i r = _mm_add_epi64(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1));
  return numext::bit_cast<uint64_t>(_mm_extract_epi64_0(r) + _mm_extract_epi64_1(r));
}

// True if any lane has its most-significant bit set (movemask_pd collects the
// top bit of each 64-bit lane); sufficient for the all-ones/all-zero masks
// produced by the comparison ops.
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet4l& a) {
  return _mm256_movemask_pd(_mm256_castsi256_pd(a)) != 0;
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet4ul& a) {
  return _mm256_movemask_pd(_mm256_castsi256_pd(a)) != 0;
}
674 
// Shuffle 64-bit lanes via the double-precision shuffle (no integer epi64
// shuffle exists pre-AVX512); M selects per-lane which input supplies each slot.
#define MM256_SHUFFLE_EPI64(A, B, M) _mm256_shuffle_pd(_mm256_castsi256_pd(A), _mm256_castsi256_pd(B), M)
// In-place 4x4 transpose of 64-bit lanes: pair rows with shuffles (masks 15/0
// pick the odd/even elements of each pair), then swap 128-bit halves with
// permute2f128 (immediates 32/49 select low/low and high/high halves).
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4l, 4>& kernel) {
  __m256d T0 = MM256_SHUFFLE_EPI64(kernel.packet[0], kernel.packet[1], 15);
  __m256d T1 = MM256_SHUFFLE_EPI64(kernel.packet[0], kernel.packet[1], 0);
  __m256d T2 = MM256_SHUFFLE_EPI64(kernel.packet[2], kernel.packet[3], 15);
  __m256d T3 = MM256_SHUFFLE_EPI64(kernel.packet[2], kernel.packet[3], 0);

  kernel.packet[1] = _mm256_castpd_si256(_mm256_permute2f128_pd(T0, T2, 32));
  kernel.packet[3] = _mm256_castpd_si256(_mm256_permute2f128_pd(T0, T2, 49));
  kernel.packet[0] = _mm256_castpd_si256(_mm256_permute2f128_pd(T1, T3, 32));
  kernel.packet[2] = _mm256_castpd_si256(_mm256_permute2f128_pd(T1, T3, 49));
}
// Transpose is a pure bit rearrangement, so the signed version works verbatim.
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4ul, 4>& kernel) {
  ptranspose((PacketBlock<Packet4l, 4>&)kernel);
}
// Signed 64-bit min: no epi64 min instruction pre-AVX512, so select via the
// comparison mask — keep a where a <= b, b where a > b.
template <>
EIGEN_STRONG_INLINE Packet4l pmin<Packet4l>(const Packet4l& a, const Packet4l& b) {
  __m256i cmp = _mm256_cmpgt_epi64(a, b);
  __m256i a_min = _mm256_andnot_si256(cmp, a);
  __m256i b_min = _mm256_and_si256(cmp, b);
  return Packet4l(_mm256_or_si256(a_min, b_min));
}
// Unsigned min: bias by 2^63 into signed range, take the signed min, un-bias.
template <>
EIGEN_STRONG_INLINE Packet4ul pmin<Packet4ul>(const Packet4ul& a, const Packet4ul& b) {
  return padd((Packet4ul)pmin((Packet4l)psub(a, pset1<Packet4ul>(0x8000000000000000UL)),
                              (Packet4l)psub(b, pset1<Packet4ul>(0x8000000000000000UL))),
              pset1<Packet4ul>(0x8000000000000000UL));
}
// Signed 64-bit max: same mask-select trick with the roles reversed.
template <>
EIGEN_STRONG_INLINE Packet4l pmax<Packet4l>(const Packet4l& a, const Packet4l& b) {
  __m256i cmp = _mm256_cmpgt_epi64(a, b);
  __m256i a_min = _mm256_and_si256(cmp, a);
  __m256i b_min = _mm256_andnot_si256(cmp, b);
  return Packet4l(_mm256_or_si256(a_min, b_min));
}
template <>
EIGEN_STRONG_INLINE Packet4ul pmax<Packet4ul>(const Packet4ul& a, const Packet4ul& b) {
  return padd((Packet4ul)pmax((Packet4l)psub(a, pset1<Packet4ul>(0x8000000000000000UL)),
                              (Packet4l)psub(b, pset1<Packet4ul>(0x8000000000000000UL))),
              pset1<Packet4ul>(0x8000000000000000UL));
}
// abs(a) without a native instruction: cmp is all-ones where a > 0, zero
// otherwise; cmp - (a ^ cmp) yields a for positive lanes and -a for the rest.
template <>
EIGEN_STRONG_INLINE Packet4l pabs<Packet4l>(const Packet4l& a) {
  Packet4l pz = pzero<Packet4l>(a);
  Packet4l cmp = _mm256_cmpgt_epi64(a, pz);
  return psub(cmp, pxor(a, cmp));
}
// Unsigned values are their own absolute value.
template <>
EIGEN_STRONG_INLINE Packet4ul pabs<Packet4ul>(const Packet4ul& a) {
  return a;
}
// Lane-wise 64-bit multiply built from 32x32->64 multiplies:
// (hi_a*2^32 + lo_a) * (hi_b*2^32 + lo_b) mod 2^64
//   = ((hi_a*lo_b + hi_b*lo_a) << 32) + lo_a*lo_b.
template <>
EIGEN_STRONG_INLINE Packet4l pmul<Packet4l>(const Packet4l& a, const Packet4l& b) {
  // 64-bit mul requires avx512, so do this with 32-bit multiplication
  __m256i upper32_a = _mm256_srli_epi64(a, 32);
  __m256i upper32_b = _mm256_srli_epi64(b, 32);

  // upper * lower
  __m256i mul1 = _mm256_mul_epu32(upper32_a, b);
  __m256i mul2 = _mm256_mul_epu32(upper32_b, a);
  // Gives us both upper*upper and lower*lower
  __m256i mul3 = _mm256_mul_epu32(a, b);

  __m256i high = _mm256_slli_epi64(_mm256_add_epi64(mul1, mul2), 32);
  return _mm256_add_epi64(high, mul3);
}
// Modulo-2^64 multiplication is sign-agnostic, so reuse the signed version.
template <>
EIGEN_STRONG_INLINE Packet4ul pmul<Packet4ul>(const Packet4ul& a, const Packet4ul& b) {
  return (Packet4ul)pmul<Packet4l>((Packet4l)a, (Packet4l)b);
}
745 #endif
746 
747 template <>
749  return _mm256_set1_ps(from);
750 }
751 template <>
753  return _mm256_set1_pd(from);
754 }
755 template <>
757  return _mm256_set1_epi32(from);
758 }
759 template <>
761  return _mm256_set1_epi32(from);
762 }
763 
764 template <>
766  return _mm256_castsi256_ps(pset1<Packet8i>(from));
767 }
768 template <>
770  return _mm256_castsi256_pd(_mm256_set1_epi64x(from));
771 }
772 
773 template <>
775  return _mm256_setzero_ps();
776 }
777 template <>
779  return _mm256_setzero_pd();
780 }
781 template <>
783  return _mm256_setzero_si256();
784 }
785 template <>
787  return _mm256_setzero_si256();
788 }
789 
790 template <>
792  return _mm256_castsi256_ps(_mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1));
793 }
794 template <>
796  return _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1);
797 }
798 template <>
800  return _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1);
801 }
802 template <>
804  return _mm256_castsi256_pd(_mm256_set_epi32(0, 0, -1, -1, 0, 0, -1, -1));
805 }
806 
807 template <>
809  return _mm256_broadcast_ss(from);
810 }
811 template <>
813  return _mm256_broadcast_sd(from);
814 }
815 
816 template <>
818  return _mm256_add_ps(a, b);
819 }
820 #ifdef EIGEN_VECTORIZE_AVX512
821 template <>
823  __mmask16 mask = static_cast<__mmask16>(umask & 0x00FF);
824  return _mm512_castps512_ps256(_mm512_maskz_add_ps(mask, _mm512_castps256_ps512(a), _mm512_castps256_ps512(b)));
825 }
826 #endif
827 template <>
829  return _mm256_add_pd(a, b);
830 }
831 template <>
833 #ifdef EIGEN_VECTORIZE_AVX2
834  return _mm256_add_epi32(a, b);
835 #else
836  __m128i lo = _mm_add_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
837  __m128i hi = _mm_add_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
838  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
839 #endif
840 }
841 template <>
843 #ifdef EIGEN_VECTORIZE_AVX2
844  return _mm256_add_epi32(a, b);
845 #else
846  __m128i lo = _mm_add_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
847  __m128i hi = _mm_add_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
848  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
849 #endif
850 }
851 
852 template <>
854  return padd(pset1<Packet8f>(a), _mm256_set_ps(7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0));
855 }
856 template <>
858  return padd(pset1<Packet4d>(a), _mm256_set_pd(3.0, 2.0, 1.0, 0.0));
859 }
860 template <>
862  return padd(pset1<Packet8i>(a), (Packet8i)_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0));
863 }
864 template <>
866  return padd(pset1<Packet8ui>(a), (Packet8ui)_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0));
867 }
868 
869 template <>
871  return _mm256_sub_ps(a, b);
872 }
873 template <>
875  return _mm256_sub_pd(a, b);
876 }
877 template <>
879 #ifdef EIGEN_VECTORIZE_AVX2
880  return _mm256_sub_epi32(a, b);
881 #else
882  __m128i lo = _mm_sub_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
883  __m128i hi = _mm_sub_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
884  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
885 #endif
886 }
887 template <>
889 #ifdef EIGEN_VECTORIZE_AVX2
890  return _mm256_sub_epi32(a, b);
891 #else
892  __m128i lo = _mm_sub_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
893  __m128i hi = _mm_sub_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
894  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
895 #endif
896 }
897 
898 template <>
900  const Packet8f mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
901  return _mm256_xor_ps(a, mask);
902 }
903 template <>
905  const Packet4d mask = _mm256_castsi256_pd(_mm256_set1_epi64x(0x8000000000000000ULL));
906  return _mm256_xor_pd(a, mask);
907 }
908 template <>
910  return psub(pzero(a), a);
911 }
912 
913 template <>
915  return a;
916 }
917 template <>
919  return a;
920 }
921 template <>
923  return a;
924 }
925 
926 template <>
928  return _mm256_mul_ps(a, b);
929 }
930 template <>
932  return _mm256_mul_pd(a, b);
933 }
934 template <>
936 #ifdef EIGEN_VECTORIZE_AVX2
937  return _mm256_mullo_epi32(a, b);
938 #else
939  const __m128i lo = _mm_mullo_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
940  const __m128i hi = _mm_mullo_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
941  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
942 #endif
943 }
944 template <>
946 #ifdef EIGEN_VECTORIZE_AVX2
947  return _mm256_mullo_epi32(a, b);
948 #else
949  const __m128i lo = _mm_mullo_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
950  const __m128i hi = _mm_mullo_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
951  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
952 #endif
953 }
954 
955 template <>
957  return _mm256_div_ps(a, b);
958 }
959 template <>
961  return _mm256_div_pd(a, b);
962 }
963 
964 template <>
966 #ifdef EIGEN_VECTORIZE_AVX512
967  return _mm512_cvttpd_epi32(_mm512_div_pd(_mm512_cvtepi32_pd(a), _mm512_cvtepi32_pd(b)));
968 #else
969  Packet4i lo = pdiv<Packet4i>(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
970  Packet4i hi = pdiv<Packet4i>(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
971  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
972 #endif
973 }
974 
975 #ifdef EIGEN_VECTORIZE_FMA
// Fused multiply-add family: each maps to a single FMA instruction, computing
// the result with one rounding step. pmadd = a*b+c, pmsub = a*b-c,
// pnmadd = -(a*b)+c, pnmsub = -(a*b)-c.
template <>
EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) {
  return _mm256_fmadd_ps(a, b, c);
}
template <>
EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d& b, const Packet4d& c) {
  return _mm256_fmadd_pd(a, b, c);
}

template <>
EIGEN_STRONG_INLINE Packet8f pmsub(const Packet8f& a, const Packet8f& b, const Packet8f& c) {
  return _mm256_fmsub_ps(a, b, c);
}

template <>
EIGEN_STRONG_INLINE Packet4d pmsub(const Packet4d& a, const Packet4d& b, const Packet4d& c) {
  return _mm256_fmsub_pd(a, b, c);
}

template <>
EIGEN_STRONG_INLINE Packet8f pnmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) {
  return _mm256_fnmadd_ps(a, b, c);
}

template <>
EIGEN_STRONG_INLINE Packet4d pnmadd(const Packet4d& a, const Packet4d& b, const Packet4d& c) {
  return _mm256_fnmadd_pd(a, b, c);
}

template <>
EIGEN_STRONG_INLINE Packet8f pnmsub(const Packet8f& a, const Packet8f& b, const Packet8f& c) {
  return _mm256_fnmsub_ps(a, b, c);
}

template <>
EIGEN_STRONG_INLINE Packet4d pnmsub(const Packet4d& a, const Packet4d& b, const Packet4d& c) {
  return _mm256_fnmsub_pd(a, b, c);
}
1014 
1015 #endif
1016 
1017 template <>
1019  return _mm256_cmp_ps(a, b, _CMP_LE_OQ);
1020 }
1021 template <>
1023  return _mm256_cmp_ps(a, b, _CMP_LT_OQ);
1024 }
1025 template <>
1027  return _mm256_cmp_ps(a, b, _CMP_NGE_UQ);
1028 }
1029 template <>
1031  return _mm256_cmp_ps(a, b, _CMP_EQ_OQ);
1032 }
1033 template <>
1035  return _mm256_cmp_ps(a, a, _CMP_UNORD_Q);
1036 }
1037 
1038 template <>
1040  return _mm256_cmp_pd(a, b, _CMP_LE_OQ);
1041 }
1042 template <>
1044  return _mm256_cmp_pd(a, b, _CMP_LT_OQ);
1045 }
1046 template <>
1048  return _mm256_cmp_pd(a, b, _CMP_NGE_UQ);
1049 }
1050 template <>
1052  return _mm256_cmp_pd(a, b, _CMP_EQ_OQ);
1053 }
1054 
1055 template <>
1057 #ifdef EIGEN_VECTORIZE_AVX2
1058  return _mm256_xor_si256(_mm256_cmpgt_epi32(a, b), _mm256_set1_epi32(-1));
1059 #else
1060  __m128i lo = _mm_cmpgt_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
1061  lo = _mm_xor_si128(lo, _mm_set1_epi32(-1));
1062  __m128i hi = _mm_cmpgt_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
1063  hi = _mm_xor_si128(hi, _mm_set1_epi32(-1));
1064  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
1065 #endif
1066 }
1067 template <>
1069 #ifdef EIGEN_VECTORIZE_AVX2
1070  return _mm256_cmpgt_epi32(b, a);
1071 #else
1072  __m128i lo = _mm_cmpgt_epi32(_mm256_extractf128_si256(b, 0), _mm256_extractf128_si256(a, 0));
1073  __m128i hi = _mm_cmpgt_epi32(_mm256_extractf128_si256(b, 1), _mm256_extractf128_si256(a, 1));
1074  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
1075 #endif
1076 }
1077 template <>
1079 #ifdef EIGEN_VECTORIZE_AVX2
1080  return _mm256_cmpeq_epi32(a, b);
1081 #else
1082  __m128i lo = _mm_cmpeq_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
1083  __m128i hi = _mm_cmpeq_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
1084  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
1085 #endif
1086 }
1087 template <>
1089 #ifdef EIGEN_VECTORIZE_AVX2
1090  return _mm256_cmpeq_epi32(a, b);
1091 #else
1092  __m128i lo = _mm_cmpeq_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
1093  __m128i hi = _mm_cmpeq_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
1094  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
1095 #endif
1096 }
1097 
1098 template <>
1100 #if EIGEN_GNUC_STRICT_LESS_THAN(6, 3, 0)
1101  // There appears to be a bug in GCC, by which the optimizer may flip
1102  // the argument order in calls to _mm_min_ps/_mm_max_ps, so we have to
1103  // resort to inline ASM here. This is supposed to be fixed in gcc6.3,
1104  // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
1105  Packet8f res;
1106  asm("vminps %[a], %[b], %[res]" : [res] "=x"(res) : [a] "x"(a), [b] "x"(b));
1107  return res;
1108 #else
1109  // Arguments are swapped to match NaN propagation behavior of std::min.
1110  return _mm256_min_ps(b, a);
1111 #endif
1112 }
1113 template <>
1115 #if EIGEN_GNUC_STRICT_LESS_THAN(6, 3, 0)
1116  // See pmin above
1117  Packet4d res;
1118  asm("vminpd %[a], %[b], %[res]" : [res] "=x"(res) : [a] "x"(a), [b] "x"(b));
1119  return res;
1120 #else
1121  // Arguments are swapped to match NaN propagation behavior of std::min.
1122  return _mm256_min_pd(b, a);
1123 #endif
1124 }
1125 template <>
1127 #ifdef EIGEN_VECTORIZE_AVX2
1128  return _mm256_min_epi32(a, b);
1129 #else
1130  __m128i lo = _mm_min_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
1131  __m128i hi = _mm_min_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
1132  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
1133 #endif
1134 }
1135 template <>
1137 #ifdef EIGEN_VECTORIZE_AVX2
1138  return _mm256_min_epu32(a, b);
1139 #else
1140  __m128i lo = _mm_min_epu32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
1141  __m128i hi = _mm_min_epu32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
1142  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
1143 #endif
1144 }
1145 
1146 template <>
1148 #if EIGEN_GNUC_STRICT_LESS_THAN(6, 3, 0)
1149  // See pmin above
1150  Packet8f res;
1151  asm("vmaxps %[a], %[b], %[res]" : [res] "=x"(res) : [a] "x"(a), [b] "x"(b));
1152  return res;
1153 #else
1154  // Arguments are swapped to match NaN propagation behavior of std::max.
1155  return _mm256_max_ps(b, a);
1156 #endif
1157 }
1158 template <>
1160 #if EIGEN_GNUC_STRICT_LESS_THAN(6, 3, 0)
1161  // See pmin above
1162  Packet4d res;
1163  asm("vmaxpd %[a], %[b], %[res]" : [res] "=x"(res) : [a] "x"(a), [b] "x"(b));
1164  return res;
1165 #else
1166  // Arguments are swapped to match NaN propagation behavior of std::max.
1167  return _mm256_max_pd(b, a);
1168 #endif
1169 }
1170 template <>
1172 #ifdef EIGEN_VECTORIZE_AVX2
1173  return _mm256_max_epi32(a, b);
1174 #else
1175  __m128i lo = _mm_max_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
1176  __m128i hi = _mm_max_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
1177  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
1178 #endif
1179 }
1180 template <>
1182 #ifdef EIGEN_VECTORIZE_AVX2
1183  return _mm256_max_epu32(a, b);
1184 #else
1185  __m128i lo = _mm_max_epu32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
1186  __m128i hi = _mm_max_epu32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
1187  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
1188 #endif
1189 }
1190 
1191 #ifdef EIGEN_VECTORIZE_AVX2
1192 template <>
1194  return _mm256_sign_epi32(_mm256_set1_epi32(1), a);
1195 }
1196 #endif
1197 
1198 // Add specializations for min/max with prescribed NaN propagation.
1199 template <>
1202 }
1203 template <>
1206 }
1207 template <>
1210 }
1211 template <>
1214 }
1215 template <>
1218 }
1219 template <>
1222 }
1223 template <>
1226 }
1227 template <>
1230 }
1231 
1232 template <>
1234  return _mm256_round_ps(a, _MM_FROUND_CUR_DIRECTION);
1235 }
1236 template <>
1238  return _mm256_round_pd(a, _MM_FROUND_CUR_DIRECTION);
1239 }
1240 
1241 template <>
1243  return _mm256_ceil_ps(a);
1244 }
1245 template <>
1247  return _mm256_ceil_pd(a);
1248 }
1249 
1250 template <>
1252  return _mm256_floor_ps(a);
1253 }
1254 template <>
1256  return _mm256_floor_pd(a);
1257 }
1258 
1259 template <>
1261  return _mm256_round_ps(a, _MM_FROUND_TRUNC);
1262 }
1263 template <>
1265  return _mm256_round_pd(a, _MM_FROUND_TRUNC);
1266 }
1267 
1268 template <>
1270 #ifdef EIGEN_VECTORIZE_AVX2
1271  // vpcmpeqd has lower latency than the more general vcmpps
1272  return _mm256_cmpeq_epi32(a, a);
1273 #else
1274  const __m256 b = _mm256_castsi256_ps(a);
1275  return _mm256_castps_si256(_mm256_cmp_ps(b, b, _CMP_TRUE_UQ));
1276 #endif
1277 }
1278 
1279 template <>
1281 #ifdef EIGEN_VECTORIZE_AVX2
1282  // vpcmpeqd has lower latency than the more general vcmpps
1283  const __m256i b = _mm256_castps_si256(a);
1284  return _mm256_castsi256_ps(_mm256_cmpeq_epi32(b, b));
1285 #else
1286  return _mm256_cmp_ps(a, a, _CMP_TRUE_UQ);
1287 #endif
1288 }
1289 
1290 template <>
1292 #ifdef EIGEN_VECTORIZE_AVX2
1293  // vpcmpeqq has lower latency than the more general vcmppd
1294  const __m256i b = _mm256_castpd_si256(a);
1295  return _mm256_castsi256_pd(_mm256_cmpeq_epi64(b, b));
1296 #else
1297  return _mm256_cmp_pd(a, a, _CMP_TRUE_UQ);
1298 #endif
1299 }
1300 
1301 template <>
1303  return _mm256_and_ps(a, b);
1304 }
1305 template <>
1307  return _mm256_and_pd(a, b);
1308 }
1309 template <>
1311 #ifdef EIGEN_VECTORIZE_AVX2
1312  return _mm256_and_si256(a, b);
1313 #else
1314  return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
1315 #endif
1316 }
1317 template <>
1319 #ifdef EIGEN_VECTORIZE_AVX2
1320  return _mm256_and_si256(a, b);
1321 #else
1322  return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
1323 #endif
1324 }
1325 
1326 template <>
1328  return _mm256_or_ps(a, b);
1329 }
1330 template <>
1332  return _mm256_or_pd(a, b);
1333 }
1334 template <>
1336 #ifdef EIGEN_VECTORIZE_AVX2
1337  return _mm256_or_si256(a, b);
1338 #else
1339  return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
1340 #endif
1341 }
1342 template <>
1344 #ifdef EIGEN_VECTORIZE_AVX2
1345  return _mm256_or_si256(a, b);
1346 #else
1347  return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
1348 #endif
1349 }
1350 
1351 template <>
1353  return _mm256_xor_ps(a, b);
1354 }
1355 template <>
1357  return _mm256_xor_pd(a, b);
1358 }
1359 template <>
1361 #ifdef EIGEN_VECTORIZE_AVX2
1362  return _mm256_xor_si256(a, b);
1363 #else
1364  return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
1365 #endif
1366 }
1367 template <>
1369 #ifdef EIGEN_VECTORIZE_AVX2
1370  return _mm256_xor_si256(a, b);
1371 #else
1372  return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
1373 #endif
1374 }
1375 
1376 template <>
1378  return _mm256_andnot_ps(b, a);
1379 }
1380 template <>
1382  return _mm256_andnot_pd(b, a);
1383 }
1384 template <>
1386 #ifdef EIGEN_VECTORIZE_AVX2
1387  return _mm256_andnot_si256(b, a);
1388 #else
1389  return _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(b), _mm256_castsi256_ps(a)));
1390 #endif
1391 }
1392 template <>
1394 #ifdef EIGEN_VECTORIZE_AVX2
1395  return _mm256_andnot_si256(b, a);
1396 #else
1397  return _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(b), _mm256_castsi256_ps(a)));
1398 #endif
1399 }
1400 
1401 template <>
1403  return pxor(pcmp_eq(a, pmax(a, b)), ptrue(a));
1404 }
1405 template <>
1407  return pcmp_eq(a, pmin(a, b));
1408 }
1409 
1410 template <>
1412  const Packet8f mask = pset1frombits<Packet8f>(static_cast<numext::uint32_t>(0x80000000u));
1413  const Packet8f prev0dot5 = pset1frombits<Packet8f>(static_cast<numext::uint32_t>(0x3EFFFFFFu));
1414  return _mm256_round_ps(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
1415 }
1416 template <>
1418  const Packet4d mask = pset1frombits<Packet4d>(static_cast<numext::uint64_t>(0x8000000000000000ull));
1419  const Packet4d prev0dot5 = pset1frombits<Packet4d>(static_cast<numext::uint64_t>(0x3FDFFFFFFFFFFFFFull));
1420  return _mm256_round_pd(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
1421 }
1422 
1423 template <>
1425  return _mm256_blendv_ps(b, a, mask);
1426 }
1427 template <>
1429  return _mm256_castps_si256(
1430  _mm256_blendv_ps(_mm256_castsi256_ps(b), _mm256_castsi256_ps(a), _mm256_castsi256_ps(mask)));
1431 }
1432 template <>
1434  return _mm256_castps_si256(
1435  _mm256_blendv_ps(_mm256_castsi256_ps(b), _mm256_castsi256_ps(a), _mm256_castsi256_ps(mask)));
1436 }
1437 
1438 template <>
1440  return _mm256_blendv_pd(b, a, mask);
1441 }
1442 
1443 template <int N>
1445 #ifdef EIGEN_VECTORIZE_AVX2
1446  return _mm256_srai_epi32(a, N);
1447 #else
1448  __m128i lo = _mm_srai_epi32(_mm256_extractf128_si256(a, 0), N);
1449  __m128i hi = _mm_srai_epi32(_mm256_extractf128_si256(a, 1), N);
1450  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
1451 #endif
1452 }
1453 
1454 template <int N>
1456 #ifdef EIGEN_VECTORIZE_AVX2
1457  return _mm256_srli_epi32(a, N);
1458 #else
1459  __m128i lo = _mm_srli_epi32(_mm256_extractf128_si256(a, 0), N);
1460  __m128i hi = _mm_srli_epi32(_mm256_extractf128_si256(a, 1), N);
1461  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
1462 #endif
1463 }
1464 
1465 template <int N>
1467 #ifdef EIGEN_VECTORIZE_AVX2
1468  return _mm256_slli_epi32(a, N);
1469 #else
1470  __m128i lo = _mm_slli_epi32(_mm256_extractf128_si256(a, 0), N);
1471  __m128i hi = _mm_slli_epi32(_mm256_extractf128_si256(a, 1), N);
1472  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
1473 #endif
1474 }
1475 
1476 template <int N>
1478  return (Packet8ui)plogical_shift_right<N>((Packet8i)a);
1479 }
1480 template <int N>
1482  return (Packet8ui)plogical_shift_right<N>((Packet8i)a);
1483 }
1484 template <int N>
1486  return (Packet8ui)plogical_shift_left<N>((Packet8i)a);
1487 }
1488 
1489 template <>
1491  EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_ps(from);
1492 }
1493 template <>
1495  EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_pd(from);
1496 }
1497 template <>
1499  EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(reinterpret_cast<const __m256i*>(from));
1500 }
1501 template <>
1503  EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(reinterpret_cast<const __m256i*>(from));
1504 }
1505 
1506 template <>
1508  EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_ps(from);
1509 }
1510 template <>
1512  EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_pd(from);
1513 }
1514 template <>
1516  EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from));
1517 }
1518 template <>
1520  EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from));
1521 }
1522 
1523 template <>
1525 #ifdef EIGEN_VECTORIZE_AVX512
1526  __mmask16 mask = static_cast<__mmask16>(umask & 0x00FF);
1527  EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_castps512_ps256(_mm512_maskz_loadu_ps(mask, from));
1528 #else
1529  Packet8i mask = _mm256_set1_epi8(static_cast<char>(umask));
1530  const Packet8i bit_mask =
1531  _mm256_set_epi32(0xffffff7f, 0xffffffbf, 0xffffffdf, 0xffffffef, 0xfffffff7, 0xfffffffb, 0xfffffffd, 0xfffffffe);
1532  mask = por<Packet8i>(mask, bit_mask);
1533  mask = pcmp_eq<Packet8i>(mask, _mm256_set1_epi32(0xffffffff));
1534  EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_maskload_ps(from, mask);
1535 #endif
1536 }
1537 
1538 // Loads 4 floats from memory and returns the packet {a0, a0, a1, a1, a2, a2, a3, a3}
1539 template <>
1541  // TODO try to find a way to avoid the need of a temporary register
1542  // Packet8f tmp = _mm256_castps128_ps256(_mm_loadu_ps(from));
1543  // tmp = _mm256_insertf128_ps(tmp, _mm_movehl_ps(_mm256_castps256_ps128(tmp),_mm256_castps256_ps128(tmp)), 1);
1544  // return _mm256_unpacklo_ps(tmp,tmp);
1545 
1546  // _mm256_insertf128_ps is very slow on Haswell, thus:
1547  Packet8f tmp = _mm256_broadcast_ps((const __m128*)(const void*)from);
1548  // mimic an "inplace" permutation of the lower 128bits using a blend
1549  tmp = _mm256_blend_ps(
1550  tmp, _mm256_castps128_ps256(_mm_permute_ps(_mm256_castps256_ps128(tmp), _MM_SHUFFLE(1, 0, 1, 0))), 15);
1551  // then we can perform a consistent permutation on the global register to get everything in shape:
1552  return _mm256_permute_ps(tmp, _MM_SHUFFLE(3, 3, 2, 2));
1553 }
1554 // Loads 2 doubles from memory and returns the packet {a0, a0, a1, a1}
1555 template <>
1557  Packet4d tmp = _mm256_broadcast_pd((const __m128d*)(const void*)from);
1558  return _mm256_permute_pd(tmp, 3 << 2);
1559 }
1560 // Loads 4 integers from memory and returns the packet {a0, a0, a1, a1, a2, a2, a3, a3}
1561 template <>
1563 #ifdef EIGEN_VECTORIZE_AVX2
1564  const Packet8i a = _mm256_castsi128_si256(ploadu<Packet4i>(from));
1565  return _mm256_permutevar8x32_epi32(a, _mm256_setr_epi32(0, 0, 1, 1, 2, 2, 3, 3));
1566 #else
1567  __m256 tmp = _mm256_broadcast_ps((const __m128*)(const void*)from);
1568  // mimic an "inplace" permutation of the lower 128bits using a blend
1569  tmp = _mm256_blend_ps(
1570  tmp, _mm256_castps128_ps256(_mm_permute_ps(_mm256_castps256_ps128(tmp), _MM_SHUFFLE(1, 0, 1, 0))), 15);
1571  // then we can perform a consistent permutation on the global register to get everything in shape:
1572  return _mm256_castps_si256(_mm256_permute_ps(tmp, _MM_SHUFFLE(3, 3, 2, 2)));
1573 #endif
1574 }
1575 template <>
1577 #ifdef EIGEN_VECTORIZE_AVX2
1578  const Packet8ui a = _mm256_castsi128_si256(ploadu<Packet4ui>(from));
1579  return _mm256_permutevar8x32_epi32(a, _mm256_setr_epi32(0, 0, 1, 1, 2, 2, 3, 3));
1580 #else
1581  __m256 tmp = _mm256_broadcast_ps((const __m128*)(const void*)from);
1582  // mimic an "inplace" permutation of the lower 128bits using a blend
1583  tmp = _mm256_blend_ps(
1584  tmp, _mm256_castps128_ps256(_mm_permute_ps(_mm256_castps256_ps128(tmp), _MM_SHUFFLE(1, 0, 1, 0))), 15);
1585  // then we can perform a consistent permutation on the global register to get
1586  // everything in shape:
1587  return _mm256_castps_si256(_mm256_permute_ps(tmp, _MM_SHUFFLE(3, 3, 2, 2)));
1588 #endif
1589 }
1590 
1591 // Loads 2 floats from memory and returns the packet {a0, a0, a0, a0, a1, a1, a1, a1}
1592 template <>
1594  Packet8f tmp = _mm256_castps128_ps256(_mm_broadcast_ss(from));
1595  return _mm256_insertf128_ps(tmp, _mm_broadcast_ss(from + 1), 1);
1596 }
1597 template <>
1599  return _mm256_insertf128_si256(_mm256_set1_epi32(*from), _mm_set1_epi32(*(from + 1)), 1);
1600 }
1601 template <>
1603  return _mm256_insertf128_si256(_mm256_set1_epi32(*from), _mm_set1_epi32(*(from + 1)), 1);
1604 }
1605 
1606 template <>
1607 EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet8f& from) {
1608  EIGEN_DEBUG_ALIGNED_STORE _mm256_store_ps(to, from);
1609 }
1610 template <>
1611 EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet4d& from) {
1612  EIGEN_DEBUG_ALIGNED_STORE _mm256_store_pd(to, from);
1613 }
1614 template <>
1615 EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet8i& from) {
1616  EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), from);
1617 }
1618 template <>
1620  EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), from);
1621 }
1622 
1623 template <>
1624 EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet8f& from) {
1625  EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_ps(to, from);
1626 }
1627 template <>
1628 EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet4d& from) {
1629  EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_pd(to, from);
1630 }
1631 template <>
1632 EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet8i& from) {
1633  EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from);
1634 }
1635 template <>
1637  EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from);
1638 }
1639 
1640 template <>
1641 EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet8f& from, uint8_t umask) {
1642 #ifdef EIGEN_VECTORIZE_AVX512
1643  __mmask16 mask = static_cast<__mmask16>(umask & 0x00FF);
1644  EIGEN_DEBUG_UNALIGNED_STORE _mm512_mask_storeu_ps(to, mask, _mm512_castps256_ps512(from));
1645 #else
1646  Packet8i mask = _mm256_set1_epi8(static_cast<char>(umask));
1647  const Packet8i bit_mask =
1648  _mm256_set_epi32(0x7f7f7f7f, 0xbfbfbfbf, 0xdfdfdfdf, 0xefefefef, 0xf7f7f7f7, 0xfbfbfbfb, 0xfdfdfdfd, 0xfefefefe);
1649  mask = por<Packet8i>(mask, bit_mask);
1650  mask = pcmp_eq<Packet8i>(mask, _mm256_set1_epi32(0xffffffff));
1651 #if EIGEN_COMP_MSVC
1652  // MSVC sometimes seems to use a bogus mask with maskstore.
1653  const __m256i ifrom = _mm256_castps_si256(from);
1654  EIGEN_DEBUG_UNALIGNED_STORE _mm_maskmoveu_si128(_mm256_extractf128_si256(ifrom, 0), _mm256_extractf128_si256(mask, 0),
1655  reinterpret_cast<char*>(to));
1656  EIGEN_DEBUG_UNALIGNED_STORE _mm_maskmoveu_si128(_mm256_extractf128_si256(ifrom, 1), _mm256_extractf128_si256(mask, 1),
1657  reinterpret_cast<char*>(to + 4));
1658 #else
1659  EIGEN_DEBUG_UNALIGNED_STORE _mm256_maskstore_ps(to, mask, from);
1660 #endif
1661 #endif
1662 }
1663 
1664 // NOTE: leverage _mm256_i32gather_ps and _mm256_i32gather_pd if AVX2 instructions are available
1665 // NOTE: for the record the following seems to be slower: return _mm256_i32gather_ps(from, _mm256_set1_epi32(stride),
1666 // 4);
1667 template <>
1668 EIGEN_DEVICE_FUNC inline Packet8f pgather<float, Packet8f>(const float* from, Index stride) {
1669  return _mm256_set_ps(from[7 * stride], from[6 * stride], from[5 * stride], from[4 * stride], from[3 * stride],
1670  from[2 * stride], from[1 * stride], from[0 * stride]);
1671 }
1672 template <>
1673 EIGEN_DEVICE_FUNC inline Packet4d pgather<double, Packet4d>(const double* from, Index stride) {
1674  return _mm256_set_pd(from[3 * stride], from[2 * stride], from[1 * stride], from[0 * stride]);
1675 }
1676 template <>
1677 EIGEN_DEVICE_FUNC inline Packet8i pgather<int, Packet8i>(const int* from, Index stride) {
1678  return _mm256_set_epi32(from[7 * stride], from[6 * stride], from[5 * stride], from[4 * stride], from[3 * stride],
1679  from[2 * stride], from[1 * stride], from[0 * stride]);
1680 }
1681 template <>
1683  return (Packet8ui)pgather<int, Packet8i>((int*)from, stride);
1684 }
1685 
1686 template <>
1687 EIGEN_DEVICE_FUNC inline void pscatter<float, Packet8f>(float* to, const Packet8f& from, Index stride) {
1688  __m128 low = _mm256_extractf128_ps(from, 0);
1689  to[stride * 0] = _mm_cvtss_f32(low);
1690  to[stride * 1] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 1));
1691  to[stride * 2] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 2));
1692  to[stride * 3] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 3));
1693 
1694  __m128 high = _mm256_extractf128_ps(from, 1);
1695  to[stride * 4] = _mm_cvtss_f32(high);
1696  to[stride * 5] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 1));
1697  to[stride * 6] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 2));
1698  to[stride * 7] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 3));
1699 }
1700 template <>
1701 EIGEN_DEVICE_FUNC inline void pscatter<double, Packet4d>(double* to, const Packet4d& from, Index stride) {
1702  __m128d low = _mm256_extractf128_pd(from, 0);
1703  to[stride * 0] = _mm_cvtsd_f64(low);
1704  to[stride * 1] = _mm_cvtsd_f64(_mm_shuffle_pd(low, low, 1));
1705  __m128d high = _mm256_extractf128_pd(from, 1);
1706  to[stride * 2] = _mm_cvtsd_f64(high);
1707  to[stride * 3] = _mm_cvtsd_f64(_mm_shuffle_pd(high, high, 1));
1708 }
1709 template <>
1710 EIGEN_DEVICE_FUNC inline void pscatter<int, Packet8i>(int* to, const Packet8i& from, Index stride) {
1711  __m128i low = _mm256_extractf128_si256(from, 0);
1712  to[stride * 0] = _mm_extract_epi32(low, 0);
1713  to[stride * 1] = _mm_extract_epi32(low, 1);
1714  to[stride * 2] = _mm_extract_epi32(low, 2);
1715  to[stride * 3] = _mm_extract_epi32(low, 3);
1716 
1717  __m128i high = _mm256_extractf128_si256(from, 1);
1718  to[stride * 4] = _mm_extract_epi32(high, 0);
1719  to[stride * 5] = _mm_extract_epi32(high, 1);
1720  to[stride * 6] = _mm_extract_epi32(high, 2);
1721  to[stride * 7] = _mm_extract_epi32(high, 3);
1722 }
1723 template <>
1725  pscatter<int, Packet8i>((int*)to, (Packet8i)from, stride);
1726 }
1727 
1728 template <>
1729 EIGEN_STRONG_INLINE void pstore1<Packet8f>(float* to, const float& a) {
1731  pstore(to, pa);
1732 }
1733 template <>
1734 EIGEN_STRONG_INLINE void pstore1<Packet4d>(double* to, const double& a) {
1736  pstore(to, pa);
1737 }
1738 template <>
1739 EIGEN_STRONG_INLINE void pstore1<Packet8i>(int* to, const int& a) {
1741  pstore(to, pa);
1742 }
1743 
1744 #ifndef EIGEN_VECTORIZE_AVX512
1745 template <>
1746 EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) {
1747  _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
1748 }
1749 template <>
1750 EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) {
1751  _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
1752 }
1753 template <>
1754 EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) {
1755  _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
1756 }
1757 template <>
1759  _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
1760 }
1761 #endif
1762 
1763 template <>
1765  return _mm_cvtss_f32(_mm256_castps256_ps128(a));
1766 }
1767 template <>
1769  return _mm_cvtsd_f64(_mm256_castpd256_pd128(a));
1770 }
1771 template <>
1773  return _mm_cvtsi128_si32(_mm256_castsi256_si128(a));
1774 }
1775 template <>
1777  return numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(_mm256_castsi256_si128(a)));
1778 }
1779 
1780 template <>
1782  __m256 tmp = _mm256_shuffle_ps(a, a, 0x1b);
1783  return _mm256_permute2f128_ps(tmp, tmp, 1);
1784 }
1785 template <>
1787  __m256d tmp = _mm256_shuffle_pd(a, a, 5);
1788  return _mm256_permute2f128_pd(tmp, tmp, 1);
1789 #if 0
1790  // This version is unlikely to be faster as _mm256_shuffle_ps and _mm256_permute_pd
1791  // exhibit the same latency/throughput, but it is here for future reference/benchmarking...
1792  __m256d swap_halves = _mm256_permute2f128_pd(a,a,1);
1793  return _mm256_permute_pd(swap_halves,5);
1794 #endif
1795 }
1796 template <>
1798  return _mm256_castps_si256(preverse(_mm256_castsi256_ps(a)));
1799 }
1800 template <>
1802  return _mm256_castps_si256(preverse(_mm256_castsi256_ps(a)));
1803 }
1804 
1805 #ifdef EIGEN_VECTORIZE_AVX2
1806 template <>
1807 EIGEN_STRONG_INLINE Packet4l preverse(const Packet4l& a) {
1808  return _mm256_castpd_si256(preverse(_mm256_castsi256_pd(a)));
1809 }
1810 template <>
1811 EIGEN_STRONG_INLINE Packet4ul preverse(const Packet4ul& a) {
1812  return _mm256_castpd_si256(preverse(_mm256_castsi256_pd(a)));
1813 }
1814 #endif
1815 
1816 // pabs should be ok
1817 template <>
1819  const Packet8f mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
1820  return _mm256_and_ps(a, mask);
1821 }
1822 template <>
1824  const Packet4d mask = _mm256_castsi256_pd(_mm256_set1_epi64x(0x7FFFFFFFFFFFFFFF));
1825  return _mm256_and_pd(a, mask);
1826 }
1827 template <>
1829 #ifdef EIGEN_VECTORIZE_AVX2
1830  return _mm256_abs_epi32(a);
1831 #else
1832  __m128i lo = _mm_abs_epi32(_mm256_extractf128_si256(a, 0));
1833  __m128i hi = _mm_abs_epi32(_mm256_extractf128_si256(a, 1));
1834  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
1835 #endif
1836 }
1837 template <>
1839  return a;
1840 }
1841 
1842 template <>
1844  return _mm_cmpgt_epi16(_mm_setzero_si128(), a);
1845 }
1846 template <>
1848  return _mm_cmpgt_epi16(_mm_setzero_si128(), a);
1849 }
1850 template <>
1852 #ifdef EIGEN_VECTORIZE_AVX2
1853  return _mm256_castsi256_ps(_mm256_cmpgt_epi32(_mm256_setzero_si256(), _mm256_castps_si256(a)));
1854 #else
1855  return _mm256_castsi256_ps(parithmetic_shift_right<31>(Packet8i(_mm256_castps_si256(a))));
1856 #endif
1857 }
1858 template <>
1860  return _mm256_setzero_si256();
1861 }
1862 #ifdef EIGEN_VECTORIZE_AVX2
1863 template <>
1865  return _mm256_castsi256_pd(_mm256_cmpgt_epi64(_mm256_setzero_si256(), _mm256_castpd_si256(a)));
1866 }
1867 template <>
1868 EIGEN_STRONG_INLINE Packet4ul psignbit(const Packet4ul& /*unused*/) {
1869  return _mm256_setzero_si256();
1870 }
1871 #endif
1872 
1873 template <>
1875  return pfrexp_generic(a, exponent);
1876 }
1877 
1878 // Extract exponent without existence of Packet4l.
1879 template <>
1881  const Packet4d cst_exp_mask = pset1frombits<Packet4d>(static_cast<uint64_t>(0x7ff0000000000000ull));
1882  __m256i a_expo = _mm256_castpd_si256(pand(a, cst_exp_mask));
1883 #ifdef EIGEN_VECTORIZE_AVX2
1884  a_expo = _mm256_srli_epi64(a_expo, 52);
1885  __m128i lo = _mm256_extractf128_si256(a_expo, 0);
1886  __m128i hi = _mm256_extractf128_si256(a_expo, 1);
1887 #else
1888  __m128i lo = _mm256_extractf128_si256(a_expo, 0);
1889  __m128i hi = _mm256_extractf128_si256(a_expo, 1);
1890  lo = _mm_srli_epi64(lo, 52);
1891  hi = _mm_srli_epi64(hi, 52);
1892 #endif
1893  Packet2d exponent_lo = _mm_cvtepi32_pd(vec4i_swizzle1(lo, 0, 2, 1, 3));
1894  Packet2d exponent_hi = _mm_cvtepi32_pd(vec4i_swizzle1(hi, 0, 2, 1, 3));
1895  Packet4d exponent = _mm256_insertf128_pd(_mm256_setzero_pd(), exponent_lo, 0);
1896  exponent = _mm256_insertf128_pd(exponent, exponent_hi, 1);
1897  return exponent;
1898 }
1899 
1900 template <>
1902  return pfrexp_generic(a, exponent);
1903 }
1904 
1905 template <>
1907  return pldexp_generic(a, exponent);
1908 }
1909 
1910 template <>
1912  // Clamp exponent to [-2099, 2099]
1913  const Packet4d max_exponent = pset1<Packet4d>(2099.0);
1914  const Packet4i e = _mm256_cvtpd_epi32(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent));
1915 
1916  // Split 2^e into four factors and multiply.
1917  const Packet4i bias = pset1<Packet4i>(1023);
1918  Packet4i b = parithmetic_shift_right<2>(e); // floor(e/4)
1919 
1920  // 2^b
1921  Packet4i hi = vec4i_swizzle1(padd(b, bias), 0, 2, 1, 3);
1922  Packet4i lo = _mm_slli_epi64(hi, 52);
1923  hi = _mm_slli_epi64(_mm_srli_epi64(hi, 32), 52);
1924  Packet4d c = _mm256_castsi256_pd(_mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1));
1925  Packet4d out = pmul(pmul(pmul(a, c), c), c); // a * 2^(3b)
1926 
1927  // 2^(e - 3b)
1928  b = psub(psub(psub(e, b), b), b); // e - 3b
1929  hi = vec4i_swizzle1(padd(b, bias), 0, 2, 1, 3);
1930  lo = _mm_slli_epi64(hi, 52);
1931  hi = _mm_slli_epi64(_mm_srli_epi64(hi, 32), 52);
1932  c = _mm256_castsi256_pd(_mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1));
1933  out = pmul(out, c); // a * 2^e
1934  return out;
1935 }
1936 
1937 template <>
1939  // Clamp exponent to [-1024, 1024]
1940  const Packet4d min_exponent = pset1<Packet4d>(-1023.0);
1941  const Packet4d max_exponent = pset1<Packet4d>(1024.0);
1942  const Packet4i e = _mm256_cvtpd_epi32(pmin(pmax(exponent, min_exponent), max_exponent));
1943  const Packet4i bias = pset1<Packet4i>(1023);
1944 
1945  // 2^e
1946  Packet4i hi = vec4i_swizzle1(padd(e, bias), 0, 2, 1, 3);
1947  const Packet4i lo = _mm_slli_epi64(hi, 52);
1948  hi = _mm_slli_epi64(_mm_srli_epi64(hi, 32), 52);
1949  const Packet4d c = _mm256_castsi256_pd(_mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1));
1950  return pmul(a, c); // a * 2^e
1951 }
1952 
1953 template <>
1955  return predux(Packet4f(_mm_add_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a, 1))));
1956 }
1957 template <>
1959  return predux(Packet2d(_mm_add_pd(_mm256_castpd256_pd128(a), _mm256_extractf128_pd(a, 1))));
1960 }
1961 template <>
1963  return predux(Packet4i(_mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1))));
1964 }
1965 template <>
1967  return predux(Packet4ui(_mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1))));
1968 }
1969 
1970 template <>
1972  return _mm_add_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a, 1));
1973 }
1974 template <>
1976  return _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1));
1977 }
1978 template <>
1980  return _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1));
1981 }
1982 
1983 template <>
1985  Packet8f tmp;
1986  tmp = _mm256_mul_ps(a, _mm256_permute2f128_ps(a, a, 1));
1987  tmp = _mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp, tmp, _MM_SHUFFLE(1, 0, 3, 2)));
1988  return pfirst(_mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp, tmp, 1)));
1989 }
1990 template <>
1992  Packet4d tmp;
1993  tmp = _mm256_mul_pd(a, _mm256_permute2f128_pd(a, a, 1));
1994  return pfirst(_mm256_mul_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1)));
1995 }
1996 
1997 template <>
1999  Packet8f tmp = _mm256_min_ps(a, _mm256_permute2f128_ps(a, a, 1));
2000  tmp = _mm256_min_ps(tmp, _mm256_shuffle_ps(tmp, tmp, _MM_SHUFFLE(1, 0, 3, 2)));
2001  return pfirst(_mm256_min_ps(tmp, _mm256_shuffle_ps(tmp, tmp, 1)));
2002 }
2003 template <>
2005  Packet4d tmp = _mm256_min_pd(a, _mm256_permute2f128_pd(a, a, 1));
2006  return pfirst(_mm256_min_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1)));
2007 }
2008 
2009 template <>
2011  Packet8f tmp = _mm256_max_ps(a, _mm256_permute2f128_ps(a, a, 1));
2012  tmp = _mm256_max_ps(tmp, _mm256_shuffle_ps(tmp, tmp, _MM_SHUFFLE(1, 0, 3, 2)));
2013  return pfirst(_mm256_max_ps(tmp, _mm256_shuffle_ps(tmp, tmp, 1)));
2014 }
2015 
2016 template <>
2018  Packet4d tmp = _mm256_max_pd(a, _mm256_permute2f128_pd(a, a, 1));
2019  return pfirst(_mm256_max_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1)));
2020 }
2021 
2022 // not needed yet
2023 // template<> EIGEN_STRONG_INLINE bool predux_all(const Packet8f& x)
2024 // {
2025 // return _mm256_movemask_ps(x)==0xFF;
2026 // }
2027 
2028 template <>
2030  return _mm256_movemask_ps(x) != 0;
2031 }
2032 
2033 template <>
2035  return _mm256_movemask_pd(x) != 0;
2036 }
2037 
2038 template <>
2040  return _mm256_movemask_ps(_mm256_castsi256_ps(x)) != 0;
2041 }
2042 template <>
2044  return _mm256_movemask_ps(_mm256_castsi256_ps(x)) != 0;
2045 }
2046 
2047 template <>
2049  return _mm_movemask_epi8(x) != 0;
2050 }
2051 template <>
2053  return _mm_movemask_epi8(x) != 0;
2054 }
2055 
2057  __m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
2058  __m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
2059  __m256 T2 = _mm256_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
2060  __m256 T3 = _mm256_unpackhi_ps(kernel.packet[2], kernel.packet[3]);
2061  __m256 T4 = _mm256_unpacklo_ps(kernel.packet[4], kernel.packet[5]);
2062  __m256 T5 = _mm256_unpackhi_ps(kernel.packet[4], kernel.packet[5]);
2063  __m256 T6 = _mm256_unpacklo_ps(kernel.packet[6], kernel.packet[7]);
2064  __m256 T7 = _mm256_unpackhi_ps(kernel.packet[6], kernel.packet[7]);
2065  __m256 S0 = _mm256_shuffle_ps(T0, T2, _MM_SHUFFLE(1, 0, 1, 0));
2066  __m256 S1 = _mm256_shuffle_ps(T0, T2, _MM_SHUFFLE(3, 2, 3, 2));
2067  __m256 S2 = _mm256_shuffle_ps(T1, T3, _MM_SHUFFLE(1, 0, 1, 0));
2068  __m256 S3 = _mm256_shuffle_ps(T1, T3, _MM_SHUFFLE(3, 2, 3, 2));
2069  __m256 S4 = _mm256_shuffle_ps(T4, T6, _MM_SHUFFLE(1, 0, 1, 0));
2070  __m256 S5 = _mm256_shuffle_ps(T4, T6, _MM_SHUFFLE(3, 2, 3, 2));
2071  __m256 S6 = _mm256_shuffle_ps(T5, T7, _MM_SHUFFLE(1, 0, 1, 0));
2072  __m256 S7 = _mm256_shuffle_ps(T5, T7, _MM_SHUFFLE(3, 2, 3, 2));
2073  kernel.packet[0] = _mm256_permute2f128_ps(S0, S4, 0x20);
2074  kernel.packet[1] = _mm256_permute2f128_ps(S1, S5, 0x20);
2075  kernel.packet[2] = _mm256_permute2f128_ps(S2, S6, 0x20);
2076  kernel.packet[3] = _mm256_permute2f128_ps(S3, S7, 0x20);
2077  kernel.packet[4] = _mm256_permute2f128_ps(S0, S4, 0x31);
2078  kernel.packet[5] = _mm256_permute2f128_ps(S1, S5, 0x31);
2079  kernel.packet[6] = _mm256_permute2f128_ps(S2, S6, 0x31);
2080  kernel.packet[7] = _mm256_permute2f128_ps(S3, S7, 0x31);
2081 }
2082 
2084  __m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
2085  __m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
2086  __m256 T2 = _mm256_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
2087  __m256 T3 = _mm256_unpackhi_ps(kernel.packet[2], kernel.packet[3]);
2088 
2089  __m256 S0 = _mm256_shuffle_ps(T0, T2, _MM_SHUFFLE(1, 0, 1, 0));
2090  __m256 S1 = _mm256_shuffle_ps(T0, T2, _MM_SHUFFLE(3, 2, 3, 2));
2091  __m256 S2 = _mm256_shuffle_ps(T1, T3, _MM_SHUFFLE(1, 0, 1, 0));
2092  __m256 S3 = _mm256_shuffle_ps(T1, T3, _MM_SHUFFLE(3, 2, 3, 2));
2093 
2094  kernel.packet[0] = _mm256_permute2f128_ps(S0, S1, 0x20);
2095  kernel.packet[1] = _mm256_permute2f128_ps(S2, S3, 0x20);
2096  kernel.packet[2] = _mm256_permute2f128_ps(S0, S1, 0x31);
2097  kernel.packet[3] = _mm256_permute2f128_ps(S2, S3, 0x31);
2098 }
2099 
// Shuffle the 32-bit lanes of two 256-bit integer vectors by routing them
// through the float shuffle: the ps<->si256 casts are pure reinterpretation,
// so the bit pattern is preserved and no AVX2 is required.
#define MM256_SHUFFLE_EPI32(A, B, M) \
 _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(A), _mm256_castsi256_ps(B), M))

#ifndef EIGEN_VECTORIZE_AVX2
// Without AVX2 there is no 256-bit integer unpack instruction, so emulate it
// with the float unpacks (again pure bit movement, valid for any payload).
#define MM256_UNPACKLO_EPI32(A, B) \
 _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(A), _mm256_castsi256_ps(B)))
#define MM256_UNPACKHI_EPI32(A, B) \
 _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(A), _mm256_castsi256_ps(B)))
#else
// AVX2 provides the native 256-bit integer unpack instructions.
#define MM256_UNPACKLO_EPI32(A, B) _mm256_unpacklo_epi32(A, B)
#define MM256_UNPACKHI_EPI32(A, B) _mm256_unpackhi_epi32(A, B)
#endif
2112 
2114  __m256i T0 = MM256_UNPACKLO_EPI32(kernel.packet[0], kernel.packet[1]);
2115  __m256i T1 = MM256_UNPACKHI_EPI32(kernel.packet[0], kernel.packet[1]);
2116  __m256i T2 = MM256_UNPACKLO_EPI32(kernel.packet[2], kernel.packet[3]);
2117  __m256i T3 = MM256_UNPACKHI_EPI32(kernel.packet[2], kernel.packet[3]);
2118  __m256i T4 = MM256_UNPACKLO_EPI32(kernel.packet[4], kernel.packet[5]);
2119  __m256i T5 = MM256_UNPACKHI_EPI32(kernel.packet[4], kernel.packet[5]);
2120  __m256i T6 = MM256_UNPACKLO_EPI32(kernel.packet[6], kernel.packet[7]);
2121  __m256i T7 = MM256_UNPACKHI_EPI32(kernel.packet[6], kernel.packet[7]);
2122  __m256i S0 = MM256_SHUFFLE_EPI32(T0, T2, _MM_SHUFFLE(1, 0, 1, 0));
2123  __m256i S1 = MM256_SHUFFLE_EPI32(T0, T2, _MM_SHUFFLE(3, 2, 3, 2));
2124  __m256i S2 = MM256_SHUFFLE_EPI32(T1, T3, _MM_SHUFFLE(1, 0, 1, 0));
2125  __m256i S3 = MM256_SHUFFLE_EPI32(T1, T3, _MM_SHUFFLE(3, 2, 3, 2));
2126  __m256i S4 = MM256_SHUFFLE_EPI32(T4, T6, _MM_SHUFFLE(1, 0, 1, 0));
2127  __m256i S5 = MM256_SHUFFLE_EPI32(T4, T6, _MM_SHUFFLE(3, 2, 3, 2));
2128  __m256i S6 = MM256_SHUFFLE_EPI32(T5, T7, _MM_SHUFFLE(1, 0, 1, 0));
2129  __m256i S7 = MM256_SHUFFLE_EPI32(T5, T7, _MM_SHUFFLE(3, 2, 3, 2));
2130  kernel.packet[0] = _mm256_permute2f128_si256(S0, S4, 0x20);
2131  kernel.packet[1] = _mm256_permute2f128_si256(S1, S5, 0x20);
2132  kernel.packet[2] = _mm256_permute2f128_si256(S2, S6, 0x20);
2133  kernel.packet[3] = _mm256_permute2f128_si256(S3, S7, 0x20);
2134  kernel.packet[4] = _mm256_permute2f128_si256(S0, S4, 0x31);
2135  kernel.packet[5] = _mm256_permute2f128_si256(S1, S5, 0x31);
2136  kernel.packet[6] = _mm256_permute2f128_si256(S2, S6, 0x31);
2137  kernel.packet[7] = _mm256_permute2f128_si256(S3, S7, 0x31);
2138 }
2141 }
2142 
2144  __m256i T0 = MM256_UNPACKLO_EPI32(kernel.packet[0], kernel.packet[1]);
2145  __m256i T1 = MM256_UNPACKHI_EPI32(kernel.packet[0], kernel.packet[1]);
2146  __m256i T2 = MM256_UNPACKLO_EPI32(kernel.packet[2], kernel.packet[3]);
2147  __m256i T3 = MM256_UNPACKHI_EPI32(kernel.packet[2], kernel.packet[3]);
2148 
2149  __m256i S0 = MM256_SHUFFLE_EPI32(T0, T2, _MM_SHUFFLE(1, 0, 1, 0));
2150  __m256i S1 = MM256_SHUFFLE_EPI32(T0, T2, _MM_SHUFFLE(3, 2, 3, 2));
2151  __m256i S2 = MM256_SHUFFLE_EPI32(T1, T3, _MM_SHUFFLE(1, 0, 1, 0));
2152  __m256i S3 = MM256_SHUFFLE_EPI32(T1, T3, _MM_SHUFFLE(3, 2, 3, 2));
2153 
2154  kernel.packet[0] = _mm256_permute2f128_si256(S0, S1, 0x20);
2155  kernel.packet[1] = _mm256_permute2f128_si256(S2, S3, 0x20);
2156  kernel.packet[2] = _mm256_permute2f128_si256(S0, S1, 0x31);
2157  kernel.packet[3] = _mm256_permute2f128_si256(S2, S3, 0x31);
2158 }
2161 }
2162 
2164  __m256d T0 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 15);
2165  __m256d T1 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 0);
2166  __m256d T2 = _mm256_shuffle_pd(kernel.packet[2], kernel.packet[3], 15);
2167  __m256d T3 = _mm256_shuffle_pd(kernel.packet[2], kernel.packet[3], 0);
2168 
2169  kernel.packet[1] = _mm256_permute2f128_pd(T0, T2, 32);
2170  kernel.packet[3] = _mm256_permute2f128_pd(T0, T2, 49);
2171  kernel.packet[0] = _mm256_permute2f128_pd(T1, T3, 32);
2172  kernel.packet[2] = _mm256_permute2f128_pd(T1, T3, 49);
2173 }
2174 
2176  return _mm256_set_epi64x(0 - ifPacket.select[3], 0 - ifPacket.select[2], 0 - ifPacket.select[1],
2177  0 - ifPacket.select[0]);
2178 }
2179 
2181  return _mm256_set_epi32(0 - ifPacket.select[7], 0 - ifPacket.select[6], 0 - ifPacket.select[5],
2182  0 - ifPacket.select[4], 0 - ifPacket.select[3], 0 - ifPacket.select[2],
2183  0 - ifPacket.select[1], 0 - ifPacket.select[0]);
2184 }
2185 
2186 template <>
2187 EIGEN_STRONG_INLINE Packet8f pblend(const Selector<8>& ifPacket, const Packet8f& thenPacket,
2188  const Packet8f& elsePacket) {
2189  const __m256 true_mask = _mm256_castsi256_ps(avx_blend_mask(ifPacket));
2190  return pselect<Packet8f>(true_mask, thenPacket, elsePacket);
2191 }
2192 
2193 template <>
2194 EIGEN_STRONG_INLINE Packet4d pblend(const Selector<4>& ifPacket, const Packet4d& thenPacket,
2195  const Packet4d& elsePacket) {
2196  const __m256d true_mask = _mm256_castsi256_pd(avx_blend_mask(ifPacket));
2197  return pselect<Packet4d>(true_mask, thenPacket, elsePacket);
2198 }
2199 
2200 // Packet math for Eigen::half
2201 #ifndef EIGEN_VECTORIZE_AVX512FP16
2202 template <>
2205  enum {
2206  size = 8,
2210  masked_store_available = false
2211  };
2212  typedef Packet8h half;
2213 };
2214 #endif
2215 
2216 template <>
2218  return _mm_set1_epi16(numext::bit_cast<numext::uint16_t>(from));
2219 }
2220 
2221 template <>
2223  return numext::bit_cast<Eigen::half>(static_cast<numext::uint16_t>(_mm_extract_epi16(from, 0)));
2224 }
2225 
2226 template <>
2228  return _mm_load_si128(reinterpret_cast<const __m128i*>(from));
2229 }
2230 
2231 template <>
2233  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
2234 }
2235 
2236 template <>
2237 EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet8h& from) {
2238  _mm_store_si128(reinterpret_cast<__m128i*>(to), from);
2239 }
2240 
2241 template <>
2242 EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet8h& from) {
2243  _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from);
2244 }
2245 
2246 template <>
2248  const numext::uint16_t a = numext::bit_cast<numext::uint16_t>(from[0]);
2249  const numext::uint16_t b = numext::bit_cast<numext::uint16_t>(from[1]);
2250  const numext::uint16_t c = numext::bit_cast<numext::uint16_t>(from[2]);
2251  const numext::uint16_t d = numext::bit_cast<numext::uint16_t>(from[3]);
2252  return _mm_set_epi16(d, d, c, c, b, b, a, a);
2253 }
2254 
2255 template <>
2257  const numext::uint16_t a = numext::bit_cast<numext::uint16_t>(from[0]);
2258  const numext::uint16_t b = numext::bit_cast<numext::uint16_t>(from[1]);
2259  return _mm_set_epi16(b, b, b, b, a, a, a, a);
2260 }
2261 
2262 template <>
2264  return _mm_cmpeq_epi32(a, a);
2265 }
2266 
2267 template <>
2269  const __m128i sign_mask = _mm_set1_epi16(static_cast<numext::uint16_t>(0x8000));
2270  return _mm_andnot_si128(sign_mask, a);
2271 }
2272 
2274 #ifdef EIGEN_HAS_FP16_C
2275  return _mm256_cvtph_ps(a);
2276 #else
2277  Eigen::internal::Packet8f pp = _mm256_castsi256_ps(
2278  _mm256_insertf128_si256(_mm256_castsi128_si256(half2floatsse(a)), half2floatsse(_mm_srli_si128(a, 8)), 1));
2279  return pp;
2280 #endif
2281 }
2282 
2284 #ifdef EIGEN_HAS_FP16_C
2285  return _mm256_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT);
2286 #else
2287  __m128i lo = float2half(_mm256_extractf128_ps(a, 0));
2288  __m128i hi = float2half(_mm256_extractf128_ps(a, 1));
2289  return _mm_packus_epi32(lo, hi);
2290 #endif
2291 }
2292 
2293 template <>
2296 }
2297 
2298 template <>
2301 }
2302 
2303 template <>
2305  return float2half(plset<Packet8f>(static_cast<float>(a)));
2306 }
2307 
2308 template <>
2310  // in some cases Packet4i is a wrapper around __m128i, so we either need to
2311  // cast to Packet4i to directly call the intrinsics as below:
2312  return _mm_or_si128(a, b);
2313 }
2314 template <>
2316  return _mm_xor_si128(a, b);
2317 }
2318 template <>
2320  return _mm_and_si128(a, b);
2321 }
2322 template <>
2324  return _mm_andnot_si128(b, a);
2325 }
2326 
2327 template <>
2329  return _mm_blendv_epi8(b, a, mask);
2330 }
2331 
2332 template <>
2335 }
2336 
2337 template <>
2340 }
2341 
2342 template <>
2345 }
2346 
2347 template <>
2350 }
2351 
2352 template <>
2355 }
2356 
2357 template <>
2359  return Pack16To8(pcmp_eq(half2float(a), half2float(b)));
2360 }
2361 
2362 template <>
2364  return Pack16To8(pcmp_le(half2float(a), half2float(b)));
2365 }
2366 
2367 template <>
2369  return Pack16To8(pcmp_lt(half2float(a), half2float(b)));
2370 }
2371 
2372 template <>
2375 }
2376 
2377 template <>
2379  return a;
2380 }
2381 
2382 template <>
2384  Packet8h sign_mask = _mm_set1_epi16(static_cast<numext::uint16_t>(0x8000));
2385  return _mm_xor_si128(a, sign_mask);
2386 }
2387 
2388 #ifndef EIGEN_VECTORIZE_AVX512FP16
2389 template <>
2391  Packet8f af = half2float(a);
2392  Packet8f bf = half2float(b);
2393  Packet8f rf = padd(af, bf);
2394  return float2half(rf);
2395 }
2396 
2397 template <>
2399  Packet8f af = half2float(a);
2400  Packet8f bf = half2float(b);
2401  Packet8f rf = psub(af, bf);
2402  return float2half(rf);
2403 }
2404 
2405 template <>
2407  Packet8f af = half2float(a);
2408  Packet8f bf = half2float(b);
2409  Packet8f rf = pmul(af, bf);
2410  return float2half(rf);
2411 }
2412 
2413 template <>
2415  Packet8f af = half2float(a);
2416  Packet8f bf = half2float(b);
2417  Packet8f rf = pdiv(af, bf);
2418  return float2half(rf);
2419 }
2420 #endif
2421 
2422 template <>
2423 EIGEN_STRONG_INLINE Packet8h pgather<Eigen::half, Packet8h>(const Eigen::half* from, Index stride) {
2424  const numext::uint16_t s0 = numext::bit_cast<numext::uint16_t>(from[0 * stride]);
2425  const numext::uint16_t s1 = numext::bit_cast<numext::uint16_t>(from[1 * stride]);
2426  const numext::uint16_t s2 = numext::bit_cast<numext::uint16_t>(from[2 * stride]);
2427  const numext::uint16_t s3 = numext::bit_cast<numext::uint16_t>(from[3 * stride]);
2428  const numext::uint16_t s4 = numext::bit_cast<numext::uint16_t>(from[4 * stride]);
2429  const numext::uint16_t s5 = numext::bit_cast<numext::uint16_t>(from[5 * stride]);
2430  const numext::uint16_t s6 = numext::bit_cast<numext::uint16_t>(from[6 * stride]);
2431  const numext::uint16_t s7 = numext::bit_cast<numext::uint16_t>(from[7 * stride]);
2432  return _mm_set_epi16(s7, s6, s5, s4, s3, s2, s1, s0);
2433 }
2434 
2435 template <>
2436 EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet8h>(Eigen::half* to, const Packet8h& from, Index stride) {
2437  EIGEN_ALIGN32 Eigen::half aux[8];
2438  pstore(aux, from);
2439  to[stride * 0] = aux[0];
2440  to[stride * 1] = aux[1];
2441  to[stride * 2] = aux[2];
2442  to[stride * 3] = aux[3];
2443  to[stride * 4] = aux[4];
2444  to[stride * 5] = aux[5];
2445  to[stride * 6] = aux[6];
2446  to[stride * 7] = aux[7];
2447 }
2448 
2449 #ifndef EIGEN_VECTORIZE_AVX512FP16
2450 template <>
2452  Packet8f af = half2float(a);
2453  float reduced = predux<Packet8f>(af);
2454  return Eigen::half(reduced);
2455 }
2456 #endif
2457 
2458 template <>
2460  Packet8f af = half2float(a);
2461  float reduced = predux_max<Packet8f>(af);
2462  return Eigen::half(reduced);
2463 }
2464 
2465 template <>
2467  Packet8f af = half2float(a);
2468  float reduced = predux_min<Packet8f>(af);
2469  return Eigen::half(reduced);
2470 }
2471 
2472 template <>
2474  Packet8f af = half2float(a);
2475  float reduced = predux_mul<Packet8f>(af);
2476  return Eigen::half(reduced);
2477 }
2478 
2479 template <>
2481  __m128i m = _mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
2482  return _mm_shuffle_epi8(a, m);
2483 }
2484 
2486  __m128i a = kernel.packet[0];
2487  __m128i b = kernel.packet[1];
2488  __m128i c = kernel.packet[2];
2489  __m128i d = kernel.packet[3];
2490  __m128i e = kernel.packet[4];
2491  __m128i f = kernel.packet[5];
2492  __m128i g = kernel.packet[6];
2493  __m128i h = kernel.packet[7];
2494 
2495  __m128i a03b03 = _mm_unpacklo_epi16(a, b);
2496  __m128i c03d03 = _mm_unpacklo_epi16(c, d);
2497  __m128i e03f03 = _mm_unpacklo_epi16(e, f);
2498  __m128i g03h03 = _mm_unpacklo_epi16(g, h);
2499  __m128i a47b47 = _mm_unpackhi_epi16(a, b);
2500  __m128i c47d47 = _mm_unpackhi_epi16(c, d);
2501  __m128i e47f47 = _mm_unpackhi_epi16(e, f);
2502  __m128i g47h47 = _mm_unpackhi_epi16(g, h);
2503 
2504  __m128i a01b01c01d01 = _mm_unpacklo_epi32(a03b03, c03d03);
2505  __m128i a23b23c23d23 = _mm_unpackhi_epi32(a03b03, c03d03);
2506  __m128i e01f01g01h01 = _mm_unpacklo_epi32(e03f03, g03h03);
2507  __m128i e23f23g23h23 = _mm_unpackhi_epi32(e03f03, g03h03);
2508  __m128i a45b45c45d45 = _mm_unpacklo_epi32(a47b47, c47d47);
2509  __m128i a67b67c67d67 = _mm_unpackhi_epi32(a47b47, c47d47);
2510  __m128i e45f45g45h45 = _mm_unpacklo_epi32(e47f47, g47h47);
2511  __m128i e67f67g67h67 = _mm_unpackhi_epi32(e47f47, g47h47);
2512 
2513  __m128i a0b0c0d0e0f0g0h0 = _mm_unpacklo_epi64(a01b01c01d01, e01f01g01h01);
2514  __m128i a1b1c1d1e1f1g1h1 = _mm_unpackhi_epi64(a01b01c01d01, e01f01g01h01);
2515  __m128i a2b2c2d2e2f2g2h2 = _mm_unpacklo_epi64(a23b23c23d23, e23f23g23h23);
2516  __m128i a3b3c3d3e3f3g3h3 = _mm_unpackhi_epi64(a23b23c23d23, e23f23g23h23);
2517  __m128i a4b4c4d4e4f4g4h4 = _mm_unpacklo_epi64(a45b45c45d45, e45f45g45h45);
2518  __m128i a5b5c5d5e5f5g5h5 = _mm_unpackhi_epi64(a45b45c45d45, e45f45g45h45);
2519  __m128i a6b6c6d6e6f6g6h6 = _mm_unpacklo_epi64(a67b67c67d67, e67f67g67h67);
2520  __m128i a7b7c7d7e7f7g7h7 = _mm_unpackhi_epi64(a67b67c67d67, e67f67g67h67);
2521 
2522  kernel.packet[0] = a0b0c0d0e0f0g0h0;
2523  kernel.packet[1] = a1b1c1d1e1f1g1h1;
2524  kernel.packet[2] = a2b2c2d2e2f2g2h2;
2525  kernel.packet[3] = a3b3c3d3e3f3g3h3;
2526  kernel.packet[4] = a4b4c4d4e4f4g4h4;
2527  kernel.packet[5] = a5b5c5d5e5f5g5h5;
2528  kernel.packet[6] = a6b6c6d6e6f6g6h6;
2529  kernel.packet[7] = a7b7c7d7e7f7g7h7;
2530 }
2531 
2533  EIGEN_ALIGN32 Eigen::half in[4][8];
2534  pstore<Eigen::half>(in[0], kernel.packet[0]);
2535  pstore<Eigen::half>(in[1], kernel.packet[1]);
2536  pstore<Eigen::half>(in[2], kernel.packet[2]);
2537  pstore<Eigen::half>(in[3], kernel.packet[3]);
2538 
2540 
2541  for (int i = 0; i < 4; ++i) {
2542  for (int j = 0; j < 4; ++j) {
2543  out[i][j] = in[j][2 * i];
2544  }
2545  for (int j = 0; j < 4; ++j) {
2546  out[i][j + 4] = in[j][2 * i + 1];
2547  }
2548  }
2549 
2550  kernel.packet[0] = pload<Packet8h>(out[0]);
2551  kernel.packet[1] = pload<Packet8h>(out[1]);
2552  kernel.packet[2] = pload<Packet8h>(out[2]);
2553  kernel.packet[3] = pload<Packet8h>(out[3]);
2554 }
2555 
2556 // BFloat16 implementation.
2557 
2559 #ifdef EIGEN_VECTORIZE_AVX2
2560  __m256i extend = _mm256_cvtepu16_epi32(a);
2561  return _mm256_castsi256_ps(_mm256_slli_epi32(extend, 16));
2562 #else
2563  __m128i lo = _mm_cvtepu16_epi32(a);
2564  __m128i hi = _mm_cvtepu16_epi32(_mm_srli_si128(a, 8));
2565  __m128i lo_shift = _mm_slli_epi32(lo, 16);
2566  __m128i hi_shift = _mm_slli_epi32(hi, 16);
2567  return _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(lo_shift), hi_shift, 1));
2568 #endif
2569 }
2570 
2571 // Convert float to bfloat16 according to round-to-nearest-even/denormals algorithm.
2573  __m256i input = _mm256_castps_si256(a);
2574 
2575 #ifdef EIGEN_VECTORIZE_AVX2
2576  // uint32_t lsb = (input >> 16);
2577  __m256i t = _mm256_srli_epi32(input, 16);
2578  // uint32_t lsb = lsb & 1;
2579  t = _mm256_and_si256(t, _mm256_set1_epi32(1));
2580  // uint32_t rounding_bias = 0x7fff + lsb;
2581  t = _mm256_add_epi32(t, _mm256_set1_epi32(0x7fff));
2582  // input += rounding_bias;
2583  t = _mm256_add_epi32(t, input);
2584  // input = input >> 16;
2585  t = _mm256_srli_epi32(t, 16);
2586  // Check NaN before converting back to bf16
2587  __m256 mask = _mm256_cmp_ps(a, a, _CMP_ORD_Q);
2588  __m256i nan = _mm256_set1_epi32(0x7fc0);
2589  t = _mm256_blendv_epi8(nan, t, _mm256_castps_si256(mask));
2590  // output = numext::bit_cast<uint16_t>(input);
2591  return _mm_packus_epi32(_mm256_extractf128_si256(t, 0), _mm256_extractf128_si256(t, 1));
2592 #else
2593  // uint32_t lsb = (input >> 16);
2594  __m128i lo = _mm_srli_epi32(_mm256_extractf128_si256(input, 0), 16);
2595  __m128i hi = _mm_srli_epi32(_mm256_extractf128_si256(input, 1), 16);
2596  // uint32_t lsb = lsb & 1;
2597  lo = _mm_and_si128(lo, _mm_set1_epi32(1));
2598  hi = _mm_and_si128(hi, _mm_set1_epi32(1));
2599  // uint32_t rounding_bias = 0x7fff + lsb;
2600  lo = _mm_add_epi32(lo, _mm_set1_epi32(0x7fff));
2601  hi = _mm_add_epi32(hi, _mm_set1_epi32(0x7fff));
2602  // input += rounding_bias;
2603  lo = _mm_add_epi32(lo, _mm256_extractf128_si256(input, 0));
2604  hi = _mm_add_epi32(hi, _mm256_extractf128_si256(input, 1));
2605  // input = input >> 16;
2606  lo = _mm_srli_epi32(lo, 16);
2607  hi = _mm_srli_epi32(hi, 16);
2608  // Check NaN before converting back to bf16
2609  __m256 mask = _mm256_cmp_ps(a, a, _CMP_ORD_Q);
2610  __m128i nan = _mm_set1_epi32(0x7fc0);
2611  lo = _mm_blendv_epi8(nan, lo, _mm_castps_si128(_mm256_castps256_ps128(mask)));
2612  hi = _mm_blendv_epi8(nan, hi, _mm_castps_si128(_mm256_extractf128_ps(mask, 1)));
2613  // output = numext::bit_cast<uint16_t>(input);
2614  return _mm_packus_epi32(lo, hi);
2615 #endif
2616 }
2617 
2618 template <>
2620  return _mm_set1_epi16(numext::bit_cast<numext::uint16_t>(from));
2621 }
2622 
2623 template <>
2625  return numext::bit_cast<bfloat16>(static_cast<numext::uint16_t>(_mm_extract_epi16(from, 0)));
2626 }
2627 
2628 template <>
2630  return _mm_load_si128(reinterpret_cast<const __m128i*>(from));
2631 }
2632 
2633 template <>
2634 EIGEN_STRONG_INLINE Packet8bf ploadu<Packet8bf>(const bfloat16* from) {
2635  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
2636 }
2637 
2638 template <>
2639 EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to, const Packet8bf& from) {
2640  _mm_store_si128(reinterpret_cast<__m128i*>(to), from);
2641 }
2642 
2643 template <>
2644 EIGEN_STRONG_INLINE void pstoreu<bfloat16>(bfloat16* to, const Packet8bf& from) {
2645  _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from);
2646 }
2647 
2648 template <>
2649 EIGEN_STRONG_INLINE Packet8bf ploaddup<Packet8bf>(const bfloat16* from) {
2650  const numext::uint16_t a = numext::bit_cast<numext::uint16_t>(from[0]);
2651  const numext::uint16_t b = numext::bit_cast<numext::uint16_t>(from[1]);
2652  const numext::uint16_t c = numext::bit_cast<numext::uint16_t>(from[2]);
2653  const numext::uint16_t d = numext::bit_cast<numext::uint16_t>(from[3]);
2654  return _mm_set_epi16(d, d, c, c, b, b, a, a);
2655 }
2656 
2657 template <>
2658 EIGEN_STRONG_INLINE Packet8bf ploadquad<Packet8bf>(const bfloat16* from) {
2659  const numext::uint16_t a = numext::bit_cast<numext::uint16_t>(from[0]);
2660  const numext::uint16_t b = numext::bit_cast<numext::uint16_t>(from[1]);
2661  return _mm_set_epi16(b, b, b, b, a, a, a, a);
2662 }
2663 
2664 template <>
2666  return _mm_cmpeq_epi32(a, a);
2667 }
2668 
2669 template <>
2671  const __m128i sign_mask = _mm_set1_epi16(static_cast<numext::uint16_t>(0x8000));
2672  return _mm_andnot_si128(sign_mask, a);
2673 }
2674 
2675 template <>
2678 }
2679 
2680 template <>
2683 }
2684 
2685 template <>
2687  return F32ToBf16(plset<Packet8f>(static_cast<float>(a)));
2688 }
2689 
2690 template <>
2692  return _mm_or_si128(a, b);
2693 }
2694 template <>
2696  return _mm_xor_si128(a, b);
2697 }
2698 template <>
2700  return _mm_and_si128(a, b);
2701 }
2702 template <>
2704  return _mm_andnot_si128(b, a);
2705 }
2706 
2707 template <>
2709  return _mm_blendv_epi8(b, a, mask);
2710 }
2711 
2712 template <>
2715 }
2716 
2717 template <>
2720 }
2721 
2722 template <>
2725 }
2726 
2727 template <>
2730 }
2731 
2732 template <>
2735 }
2736 
2737 template <>
2739  return Pack16To8(pcmp_eq(Bf16ToF32(a), Bf16ToF32(b)));
2740 }
2741 
2742 template <>
2744  return Pack16To8(pcmp_le(Bf16ToF32(a), Bf16ToF32(b)));
2745 }
2746 
2747 template <>
2749  return Pack16To8(pcmp_lt(Bf16ToF32(a), Bf16ToF32(b)));
2750 }
2751 
2752 template <>
2755 }
2756 
2757 template <>
2759  return a;
2760 }
2761 
2762 template <>
2764  Packet8bf sign_mask = _mm_set1_epi16(static_cast<numext::uint16_t>(0x8000));
2765  return _mm_xor_si128(a, sign_mask);
2766 }
2767 
2768 template <>
2771 }
2772 
2773 template <>
2776 }
2777 
2778 template <>
2781 }
2782 
2783 template <>
2786 }
2787 
2788 template <>
2789 EIGEN_STRONG_INLINE Packet8bf pgather<bfloat16, Packet8bf>(const bfloat16* from, Index stride) {
2790  const numext::uint16_t s0 = numext::bit_cast<numext::uint16_t>(from[0 * stride]);
2791  const numext::uint16_t s1 = numext::bit_cast<numext::uint16_t>(from[1 * stride]);
2792  const numext::uint16_t s2 = numext::bit_cast<numext::uint16_t>(from[2 * stride]);
2793  const numext::uint16_t s3 = numext::bit_cast<numext::uint16_t>(from[3 * stride]);
2794  const numext::uint16_t s4 = numext::bit_cast<numext::uint16_t>(from[4 * stride]);
2795  const numext::uint16_t s5 = numext::bit_cast<numext::uint16_t>(from[5 * stride]);
2796  const numext::uint16_t s6 = numext::bit_cast<numext::uint16_t>(from[6 * stride]);
2797  const numext::uint16_t s7 = numext::bit_cast<numext::uint16_t>(from[7 * stride]);
2798  return _mm_set_epi16(s7, s6, s5, s4, s3, s2, s1, s0);
2799 }
2800 
2801 template <>
2802 EIGEN_STRONG_INLINE void pscatter<bfloat16, Packet8bf>(bfloat16* to, const Packet8bf& from, Index stride) {
2803  EIGEN_ALIGN32 bfloat16 aux[8];
2804  pstore(aux, from);
2805  to[stride * 0] = aux[0];
2806  to[stride * 1] = aux[1];
2807  to[stride * 2] = aux[2];
2808  to[stride * 3] = aux[3];
2809  to[stride * 4] = aux[4];
2810  to[stride * 5] = aux[5];
2811  to[stride * 6] = aux[6];
2812  to[stride * 7] = aux[7];
2813 }
2814 
2815 template <>
2817  return static_cast<bfloat16>(predux<Packet8f>(Bf16ToF32(a)));
2818 }
2819 
2820 template <>
2822  return static_cast<bfloat16>(predux_max<Packet8f>(Bf16ToF32(a)));
2823 }
2824 
2825 template <>
2827  return static_cast<bfloat16>(predux_min<Packet8f>(Bf16ToF32(a)));
2828 }
2829 
2830 template <>
2832  return static_cast<bfloat16>(predux_mul<Packet8f>(Bf16ToF32(a)));
2833 }
2834 
2835 template <>
2837  __m128i m = _mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
2838  return _mm_shuffle_epi8(a, m);
2839 }
2840 
2841 EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8bf, 8>& kernel) {
2842  __m128i a = kernel.packet[0];
2843  __m128i b = kernel.packet[1];
2844  __m128i c = kernel.packet[2];
2845  __m128i d = kernel.packet[3];
2846  __m128i e = kernel.packet[4];
2847  __m128i f = kernel.packet[5];
2848  __m128i g = kernel.packet[6];
2849  __m128i h = kernel.packet[7];
2850 
2851  __m128i a03b03 = _mm_unpacklo_epi16(a, b);
2852  __m128i c03d03 = _mm_unpacklo_epi16(c, d);
2853  __m128i e03f03 = _mm_unpacklo_epi16(e, f);
2854  __m128i g03h03 = _mm_unpacklo_epi16(g, h);
2855  __m128i a47b47 = _mm_unpackhi_epi16(a, b);
2856  __m128i c47d47 = _mm_unpackhi_epi16(c, d);
2857  __m128i e47f47 = _mm_unpackhi_epi16(e, f);
2858  __m128i g47h47 = _mm_unpackhi_epi16(g, h);
2859 
2860  __m128i a01b01c01d01 = _mm_unpacklo_epi32(a03b03, c03d03);
2861  __m128i a23b23c23d23 = _mm_unpackhi_epi32(a03b03, c03d03);
2862  __m128i e01f01g01h01 = _mm_unpacklo_epi32(e03f03, g03h03);
2863  __m128i e23f23g23h23 = _mm_unpackhi_epi32(e03f03, g03h03);
2864  __m128i a45b45c45d45 = _mm_unpacklo_epi32(a47b47, c47d47);
2865  __m128i a67b67c67d67 = _mm_unpackhi_epi32(a47b47, c47d47);
2866  __m128i e45f45g45h45 = _mm_unpacklo_epi32(e47f47, g47h47);
2867  __m128i e67f67g67h67 = _mm_unpackhi_epi32(e47f47, g47h47);
2868 
2869  kernel.packet[0] = _mm_unpacklo_epi64(a01b01c01d01, e01f01g01h01);
2870  kernel.packet[1] = _mm_unpackhi_epi64(a01b01c01d01, e01f01g01h01);
2871  kernel.packet[2] = _mm_unpacklo_epi64(a23b23c23d23, e23f23g23h23);
2872  kernel.packet[3] = _mm_unpackhi_epi64(a23b23c23d23, e23f23g23h23);
2873  kernel.packet[4] = _mm_unpacklo_epi64(a45b45c45d45, e45f45g45h45);
2874  kernel.packet[5] = _mm_unpackhi_epi64(a45b45c45d45, e45f45g45h45);
2875  kernel.packet[6] = _mm_unpacklo_epi64(a67b67c67d67, e67f67g67h67);
2876  kernel.packet[7] = _mm_unpackhi_epi64(a67b67c67d67, e67f67g67h67);
2877 }
2878 
2879 EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8bf, 4>& kernel) {
2880  __m128i a = kernel.packet[0];
2881  __m128i b = kernel.packet[1];
2882  __m128i c = kernel.packet[2];
2883  __m128i d = kernel.packet[3];
2884 
2885  __m128i ab_03 = _mm_unpacklo_epi16(a, b);
2886  __m128i cd_03 = _mm_unpacklo_epi16(c, d);
2887  __m128i ab_47 = _mm_unpackhi_epi16(a, b);
2888  __m128i cd_47 = _mm_unpackhi_epi16(c, d);
2889 
2890  kernel.packet[0] = _mm_unpacklo_epi32(ab_03, cd_03);
2891  kernel.packet[1] = _mm_unpackhi_epi32(ab_03, cd_03);
2892  kernel.packet[2] = _mm_unpacklo_epi32(ab_47, cd_47);
2893  kernel.packet[3] = _mm_unpackhi_epi32(ab_47, cd_47);
2894 }
2895 
2896 } // end namespace internal
2897 
2898 } // end namespace Eigen
2899 
2900 #endif // EIGEN_PACKET_MATH_AVX_H
#define MM256_UNPACKLO_EPI32(A, B)
Definition: AVX/PacketMath.h:2104
#define MM256_SHUFFLE_EPI32(A, B, M)
Definition: AVX/PacketMath.h:2100
#define MM256_UNPACKHI_EPI32(A, B)
Definition: AVX/PacketMath.h:2106
int i
Definition: BiCGSTAB_step_by_step.cpp:9
#define EIGEN_ALIGN32
Definition: ConfigureVectorization.h:143
Array< double, 1, 3 > e(1./3., 0.5, 2.)
#define EIGEN_DEBUG_ALIGNED_STORE
Definition: GenericPacketMath.h:38
#define EIGEN_DEBUG_ALIGNED_LOAD
Definition: GenericPacketMath.h:30
#define EIGEN_DEBUG_UNALIGNED_STORE
Definition: GenericPacketMath.h:42
#define EIGEN_DEBUG_UNALIGNED_LOAD
Definition: GenericPacketMath.h:34
#define EIGEN_DEVICE_FUNC
Definition: Macros.h:892
#define EIGEN_FAST_MATH
Definition: Macros.h:51
#define EIGEN_STRONG_INLINE
Definition: Macros.h:834
cout<< "Here is the matrix m:"<< endl<< m<< endl;Matrix< ptrdiff_t, 3, 1 > res
Definition: PartialRedux_count.cpp:3
#define vec4i_swizzle1(v, p, q, r, s)
Definition: SSE/PacketMath.h:98
Scalar * b
Definition: benchVecAdd.cpp:17
@ N
Definition: constructor.cpp:22
static int f(const TensorMap< Tensor< int, 3 > > &tensor)
Definition: cxx11_tensor_map.cpp:237
@ Aligned32
Definition: Constants.h:238
@ Aligned16
Definition: Constants.h:237
return int(ret)+1
const Scalar * a
Definition: level2_cplx_impl.h:32
const char const int const RealScalar const RealScalar * pa
Definition: level2_cplx_impl.h:20
int * m
Definition: level2_cplx_impl.h:294
Eigen::Matrix< Scalar, Dynamic, Dynamic, ColMajor > tmp
Definition: level3_impl.h:365
EIGEN_STRONG_INLINE Packet8bf ptrunc< Packet8bf >(const Packet8bf &a)
Definition: AltiVec/PacketMath.h:2368
EIGEN_STRONG_INLINE Packet8i ploadu< Packet8i >(const int *from)
Definition: AVX/PacketMath.h:1515
EIGEN_STRONG_INLINE Packet4d print< Packet4d >(const Packet4d &a)
Definition: AVX/PacketMath.h:1237
EIGEN_STRONG_INLINE Packet8f pmax< PropagateNumbers, Packet8f >(const Packet8f &a, const Packet8f &b)
Definition: AVX/PacketMath.h:1208
__m128d Packet2d
Definition: LSX/PacketMath.h:36
EIGEN_STRONG_INLINE void pstoreu< double >(double *to, const Packet4d &from)
Definition: AVX/PacketMath.h:1628
EIGEN_STRONG_INLINE Packet pminmax_propagate_numbers(const Packet &a, const Packet &b, Op op)
Definition: SSE/PacketMath.h:1118
EIGEN_STRONG_INLINE Packet8ui pandnot< Packet8ui >(const Packet8ui &a, const Packet8ui &b)
Definition: AVX/PacketMath.h:1393
EIGEN_STRONG_INLINE void pstoreu< uint32_t >(uint32_t *to, const Packet8ui &from)
Definition: AVX/PacketMath.h:1636
EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf &a)
Definition: AltiVec/Complex.h:268
eigen_packet_wrapper< __m128i, 3 > Packet2l
Definition: LSX/PacketMath.h:41
EIGEN_STRONG_INLINE Packet8bf print< Packet8bf >(const Packet8bf &a)
Definition: AVX/PacketMath.h:2718
EIGEN_STRONG_INLINE Packet8i ploadquad< Packet8i >(const int *from)
Definition: AVX/PacketMath.h:1598
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8bf pgather< bfloat16, Packet8bf >(const bfloat16 *from, Index stride)
Definition: AltiVec/PacketMath.h:874
EIGEN_STRONG_INLINE Packet8h pmax< Packet8h >(const Packet8h &a, const Packet8h &b)
Definition: AVX/PacketMath.h:2299
EIGEN_STRONG_INLINE void prefetch< uint32_t >(const uint32_t *addr)
Definition: AVX/PacketMath.h:1758
EIGEN_DEVICE_FUNC Packet padd(const Packet &a, const Packet &b)
Definition: GenericPacketMath.h:318
EIGEN_STRONG_INLINE Packet8i psub< Packet8i >(const Packet8i &a, const Packet8i &b)
Definition: AVX/PacketMath.h:878
EIGEN_STRONG_INLINE Packet8h pmul< Packet8h >(const Packet8h &a, const Packet8h &b)
Definition: AVX/PacketMath.h:2406
EIGEN_STRONG_INLINE Packet8f pmin< PropagateNumbers, Packet8f >(const Packet8f &a, const Packet8f &b)
Definition: AVX/PacketMath.h:1200
EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f &a)
Definition: AVX/PacketMath.h:2283
EIGEN_STRONG_INLINE Packet8f Bf16ToF32(const Packet8bf &a)
Definition: AVX/PacketMath.h:2558
EIGEN_STRONG_INLINE Packet8f pzero(const Packet8f &)
Definition: AVX/PacketMath.h:774
__vector int Packet4i
Definition: AltiVec/PacketMath.h:34
EIGEN_STRONG_INLINE Packet8f pfloor< Packet8f >(const Packet8f &a)
Definition: AVX/PacketMath.h:1251
EIGEN_ALWAYS_INLINE int64_t _mm_extract_epi64_0(const __m128i &a)
Definition: SSE/PacketMath.h:161
EIGEN_STRONG_INLINE Packet8f pisnan(const Packet8f &a)
Definition: AVX/PacketMath.h:1034
EIGEN_STRONG_INLINE Packet8f pmax< Packet8f >(const Packet8f &a, const Packet8f &b)
Definition: AVX/PacketMath.h:1147
EIGEN_STRONG_INLINE Packet4d pset1< Packet4d >(const double &from)
Definition: AVX/PacketMath.h:752
EIGEN_STRONG_INLINE Packet8f pmax< PropagateNaN, Packet8f >(const Packet8f &a, const Packet8f &b)
Definition: AVX/PacketMath.h:1224
EIGEN_STRONG_INLINE Packet8i padd< Packet8i >(const Packet8i &a, const Packet8i &b)
Definition: AVX/PacketMath.h:832
EIGEN_STRONG_INLINE void pstore1< Packet8i >(int *to, const int &a)
Definition: AVX/PacketMath.h:1739
EIGEN_STRONG_INLINE Packet4d pfrexp< Packet4d >(const Packet4d &a, Packet4d &exponent)
Definition: AVX/PacketMath.h:1901
EIGEN_STRONG_INLINE bfloat16 predux_mul< Packet8bf >(const Packet8bf &a)
Definition: AltiVec/PacketMath.h:2558
EIGEN_STRONG_INLINE Packet8bf pmin< Packet8bf >(const Packet8bf &a, const Packet8bf &b)
Definition: AltiVec/PacketMath.h:2391
EIGEN_STRONG_INLINE float predux_mul< Packet8f >(const Packet8f &a)
Definition: AVX/PacketMath.h:1984
EIGEN_STRONG_INLINE Packet8i pset1< Packet8i >(const int &from)
Definition: AVX/PacketMath.h:756
EIGEN_STRONG_INLINE Packet4d ptrunc< Packet4d >(const Packet4d &a)
Definition: AVX/PacketMath.h:1264
EIGEN_STRONG_INLINE __m128i Pack16To8(Packet8f rf)
Definition: AVX/PacketMath.h:402
EIGEN_STRONG_INLINE Packet4d pldexp_fast< Packet4d >(const Packet4d &a, const Packet4d &exponent)
Definition: AVX/PacketMath.h:1938
EIGEN_STRONG_INLINE Packet4i pset1< Packet4i >(const int &from)
Definition: AltiVec/PacketMath.h:778
EIGEN_STRONG_INLINE Packet8bf pceil< Packet8bf >(const Packet8bf &a)
Definition: AltiVec/PacketMath.h:2360
EIGEN_STRONG_INLINE Packet8bf pdiv< Packet8bf >(const Packet8bf &a, const Packet8bf &b)
Definition: AltiVec/PacketMath.h:2293
EIGEN_STRONG_INLINE Packet8f pselect< Packet8f >(const Packet8f &mask, const Packet8f &a, const Packet8f &b)
Definition: AVX/PacketMath.h:1424
EIGEN_STRONG_INLINE Packet8bf pround< Packet8bf >(const Packet8bf &a)
Definition: AltiVec/PacketMath.h:2364
EIGEN_STRONG_INLINE void pstore1< Packet8f >(float *to, const float &a)
Definition: AVX/PacketMath.h:1729
EIGEN_STRONG_INLINE Packet8h print< Packet8h >(const Packet8h &a)
Definition: AVX/PacketMath.h:2338
EIGEN_STRONG_INLINE Packet8h pset1< Packet8h >(const Eigen::half &from)
Definition: AVX/PacketMath.h:2217
EIGEN_STRONG_INLINE Packet8h psub< Packet8h >(const Packet8h &a, const Packet8h &b)
Definition: AVX/PacketMath.h:2398
EIGEN_STRONG_INLINE Packet8f pfrexp< Packet8f >(const Packet8f &a, Packet8f &exponent)
Definition: AVX/PacketMath.h:1874
EIGEN_STRONG_INLINE Packet8f pxor< Packet8f >(const Packet8f &a, const Packet8f &b)
Definition: AVX/PacketMath.h:1352
EIGEN_STRONG_INLINE void ptranspose(PacketBlock< Packet2cf, 2 > &kernel)
Definition: AltiVec/Complex.h:339
EIGEN_STRONG_INLINE Packet4d pceil< Packet4d >(const Packet4d &a)
Definition: AVX/PacketMath.h:1246
EIGEN_STRONG_INLINE Packet8h pfloor< Packet8h >(const Packet8h &a)
Definition: AVX/PacketMath.h:2348
EIGEN_STRONG_INLINE float predux_min< Packet8f >(const Packet8f &a)
Definition: AVX/PacketMath.h:1998
EIGEN_STRONG_INLINE bool predux_any(const Packet4f &x)
Definition: AltiVec/PacketMath.h:2751
EIGEN_STRONG_INLINE int pfirst< Packet8i >(const Packet8i &a)
Definition: AVX/PacketMath.h:1772
EIGEN_STRONG_INLINE Packet8i por< Packet8i >(const Packet8i &a, const Packet8i &b)
Definition: AVX/PacketMath.h:1335
EIGEN_DEVICE_FUNC Packet8ui pgather< uint32_t, Packet8ui >(const uint32_t *from, Index stride)
Definition: AVX/PacketMath.h:1682
EIGEN_STRONG_INLINE Packet8f pset1frombits< Packet8f >(unsigned int from)
Definition: AVX/PacketMath.h:765
EIGEN_STRONG_INLINE Packet8f psub< Packet8f >(const Packet8f &a, const Packet8f &b)
Definition: AVX/PacketMath.h:870
EIGEN_DEVICE_FUNC Packet pdiv(const Packet &a, const Packet &b)
Definition: GenericPacketMath.h:368
EIGEN_ALWAYS_INLINE int64_t _mm_extract_epi64_1(const __m128i &a)
Definition: SSE/PacketMath.h:164
EIGEN_STRONG_INLINE void pstore< bfloat16 >(bfloat16 *to, const Packet8bf &from)
Definition: AltiVec/PacketMath.h:662
EIGEN_STRONG_INLINE Packet4d pldexp< Packet4d >(const Packet4d &a, const Packet4d &exponent)
Definition: AVX/PacketMath.h:1911
EIGEN_STRONG_INLINE Packet4i pdiv< Packet4i >(const Packet4i &a, const Packet4i &b)
Definition: AltiVec/PacketMath.h:1205
EIGEN_STRONG_INLINE Packet4f predux_half_dowto4< Packet8f >(const Packet8f &a)
Definition: AVX/PacketMath.h:1971
EIGEN_STRONG_INLINE Packet8f pmin< Packet8f >(const Packet8f &a, const Packet8f &b)
Definition: AVX/PacketMath.h:1099
EIGEN_DEVICE_FUNC Packet8i pgather< int, Packet8i >(const int *from, Index stride)
Definition: AVX/PacketMath.h:1677
EIGEN_STRONG_INLINE Packet4d pmin< PropagateNumbers, Packet4d >(const Packet4d &a, const Packet4d &b)
Definition: AVX/PacketMath.h:1204
EIGEN_STRONG_INLINE Packet8f padd< Packet8f >(const Packet8f &a, const Packet8f &b)
Definition: AVX/PacketMath.h:817
EIGEN_STRONG_INLINE Packet4i plogical_shift_left(const Packet4i &a)
Definition: AltiVec/PacketMath.h:1983
EIGEN_STRONG_INLINE Eigen::half predux< Packet8h >(const Packet8h &a)
Definition: AVX/PacketMath.h:2451
EIGEN_STRONG_INLINE Packet8ui pset1< Packet8ui >(const uint32_t &from)
Definition: AVX/PacketMath.h:760
EIGEN_DEVICE_FUNC Packet pmax(const Packet &a, const Packet &b)
Definition: GenericPacketMath.h:663
EIGEN_STRONG_INLINE Packet4i pblend(const Selector< 4 > &ifPacket, const Packet4i &thenPacket, const Packet4i &elsePacket)
Definition: AltiVec/PacketMath.h:3075
EIGEN_STRONG_INLINE double predux< Packet4d >(const Packet4d &a)
Definition: AVX/PacketMath.h:1958
EIGEN_STRONG_INLINE Packet8bf padd< Packet8bf >(const Packet8bf &a, const Packet8bf &b)
Definition: AltiVec/PacketMath.h:2283
EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f &a, const Packet4f &b)
Definition: AltiVec/PacketMath.h:1314
EIGEN_STRONG_INLINE Packet8h plset< Packet8h >(const half &a)
Definition: AVX/PacketMath.h:2304
EIGEN_STRONG_INLINE Packet4i plogical_shift_right(const Packet4i &a)
Definition: AltiVec/PacketMath.h:1979
EIGEN_STRONG_INLINE Packet4i predux_half_dowto4< Packet8i >(const Packet8i &a)
Definition: AVX/PacketMath.h:1975
EIGEN_STRONG_INLINE Packet4d pdiv< Packet4d >(const Packet4d &a, const Packet4d &b)
Definition: AVX/PacketMath.h:960
EIGEN_STRONG_INLINE Packet pminmax_propagate_nan(const Packet &a, const Packet &b, Op op)
Definition: SSE/PacketMath.h:1127
EIGEN_STRONG_INLINE void pstore1< Packet4d >(double *to, const double &a)
Definition: AVX/PacketMath.h:1734
EIGEN_STRONG_INLINE Packet4ui predux_half_dowto4< Packet8ui >(const Packet8ui &a)
Definition: AVX/PacketMath.h:1979
EIGEN_STRONG_INLINE Packet8h pdiv< Packet8h >(const Packet8h &a, const Packet8h &b)
Definition: AVX/PacketMath.h:2414
EIGEN_STRONG_INLINE void pstore< int >(int *to, const Packet4i &from)
Definition: AltiVec/PacketMath.h:647
EIGEN_STRONG_INLINE Packet8bf plset< Packet8bf >(const bfloat16 &a)
Definition: AltiVec/PacketMath.h:2428
EIGEN_STRONG_INLINE Packet8h por(const Packet8h &a, const Packet8h &b)
Definition: AVX/PacketMath.h:2309
EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i &a, const Packet4i &b)
Definition: AltiVec/PacketMath.h:1341
EIGEN_STRONG_INLINE Packet8ui pload< Packet8ui >(const uint32_t *from)
Definition: AVX/PacketMath.h:1502
EIGEN_STRONG_INLINE Packet8bf pmul< Packet8bf >(const Packet8bf &a, const Packet8bf &b)
Definition: AltiVec/PacketMath.h:2288
EIGEN_STRONG_INLINE Packet8bf ploaddup< Packet8bf >(const bfloat16 *from)
Definition: AltiVec/PacketMath.h:2423
EIGEN_STRONG_INLINE Packet8f ploaddup< Packet8f >(const float *from)
Definition: AVX/PacketMath.h:1540
EIGEN_STRONG_INLINE Packet8f pandnot< Packet8f >(const Packet8f &a, const Packet8f &b)
Definition: AVX/PacketMath.h:1377
__vector unsigned int Packet4ui
Definition: AltiVec/PacketMath.h:35
EIGEN_STRONG_INLINE float predux< Packet8f >(const Packet8f &a)
Definition: AVX/PacketMath.h:1954
EIGEN_STRONG_INLINE Packet8f plset< Packet8f >(const float &a)
Definition: AVX/PacketMath.h:853
EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf &a)
Definition: AltiVec/Complex.h:303
EIGEN_STRONG_INLINE Packet8i pdiv< Packet8i >(const Packet8i &a, const Packet8i &b)
Definition: AVX/PacketMath.h:965
EIGEN_STRONG_INLINE Packet4d pmax< Packet4d >(const Packet4d &a, const Packet4d &b)
Definition: AVX/PacketMath.h:1159
EIGEN_STRONG_INLINE Packet8f half2float(const Packet8h &a)
Definition: AVX/PacketMath.h:2273
EIGEN_STRONG_INLINE void pstore< double >(double *to, const Packet4d &from)
Definition: AVX/PacketMath.h:1611
EIGEN_STRONG_INLINE Packet8f ploadu< Packet8f >(const float *from)
Definition: AVX/PacketMath.h:1507
EIGEN_STRONG_INLINE Packet8ui psub< Packet8ui >(const Packet8ui &a, const Packet8ui &b)
Definition: AVX/PacketMath.h:888
EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f &a, const Packet4f &b, const Packet4f &c)
Definition: AltiVec/PacketMath.h:1218
EIGEN_STRONG_INLINE double predux_min< Packet4d >(const Packet4d &a)
Definition: AVX/PacketMath.h:2004
EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf &a, const Packet4cf &b)
Definition: AVX/Complex.h:88
EIGEN_STRONG_INLINE Packet8ui ploaddup< Packet8ui >(const uint32_t *from)
Definition: AVX/PacketMath.h:1576
EIGEN_STRONG_INLINE Packet8h ptrue(const Packet8h &a)
Definition: AVX/PacketMath.h:2263
EIGEN_STRONG_INLINE Packet4d pmin< PropagateNaN, Packet4d >(const Packet4d &a, const Packet4d &b)
Definition: AVX/PacketMath.h:1220
EIGEN_DEVICE_FUNC Packet pmin(const Packet &a, const Packet &b)
Definition: GenericPacketMath.h:649
EIGEN_STRONG_INLINE Packet8h ptrunc< Packet8h >(const Packet8h &a)
Definition: AVX/PacketMath.h:2353
EIGEN_STRONG_INLINE Eigen::half predux_min< Packet8h >(const Packet8h &a)
Definition: AVX/PacketMath.h:2466
EIGEN_STRONG_INLINE Packet8h pandnot(const Packet8h &a, const Packet8h &b)
Definition: AVX/PacketMath.h:2323
EIGEN_STRONG_INLINE double predux_max< Packet4d >(const Packet4d &a)
Definition: AVX/PacketMath.h:2017
EIGEN_STRONG_INLINE Packet8f pload< Packet8f >(const float *from)
Definition: AVX/PacketMath.h:1490
eigen_packet_wrapper< __vector unsigned short int, 0 > Packet8bf
Definition: AltiVec/PacketMath.h:42
EIGEN_STRONG_INLINE Packet4d ptrue< Packet4d >(const Packet4d &a)
Definition: AVX/PacketMath.h:1291
EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf &a)
Definition: AltiVec/Complex.h:264
EIGEN_STRONG_INLINE void prefetch< float >(const float *addr)
Definition: AltiVec/PacketMath.h:1854
EIGEN_STRONG_INLINE Packet4d ploadu< Packet4d >(const double *from)
Definition: AVX/PacketMath.h:1511
EIGEN_STRONG_INLINE void pstoreu< bfloat16 >(bfloat16 *to, const Packet8bf &from)
Definition: AltiVec/PacketMath.h:1772
EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packet4i &a)
Definition: AltiVec/PacketMath.h:1975
EIGEN_STRONG_INLINE Packet8i pload< Packet8i >(const int *from)
Definition: AVX/PacketMath.h:1498
EIGEN_STRONG_INLINE Packet4d pfrexp_generic_get_biased_exponent(const Packet4d &a)
Definition: AVX/PacketMath.h:1880
EIGEN_STRONG_INLINE Packet8f ploadquad< Packet8f >(const float *from)
Definition: AVX/PacketMath.h:1593
EIGEN_STRONG_INLINE Packet4d pmul< Packet4d >(const Packet4d &a, const Packet4d &b)
Definition: AVX/PacketMath.h:931
EIGEN_STRONG_INLINE uint32_t pfirst< Packet8ui >(const Packet8ui &a)
Definition: AVX/PacketMath.h:1776
EIGEN_STRONG_INLINE Packet8i pxor< Packet8i >(const Packet8i &a, const Packet8i &b)
Definition: AVX/PacketMath.h:1360
EIGEN_STRONG_INLINE __m256i avx_blend_mask(const Selector< 4 > &ifPacket)
Definition: AVX/PacketMath.h:2175
EIGEN_STRONG_INLINE Packet4i ploadu< Packet4i >(const int *from)
Definition: AltiVec/PacketMath.h:1537
EIGEN_STRONG_INLINE Packet8bf psignbit(const Packet8bf &a)
Definition: AltiVec/PacketMath.h:1966
EIGEN_STRONG_INLINE Packet8i plset< Packet8i >(const int &a)
Definition: AVX/PacketMath.h:861
EIGEN_STRONG_INLINE Packet4d ploaddup< Packet4d >(const double *from)
Definition: AVX/PacketMath.h:1556
EIGEN_STRONG_INLINE Packet8f pceil< Packet8f >(const Packet8f &a)
Definition: AVX/PacketMath.h:1242
EIGEN_STRONG_INLINE Packet8ui ploadu< Packet8ui >(const uint32_t *from)
Definition: AVX/PacketMath.h:1519
EIGEN_STRONG_INLINE Packet8f por< Packet8f >(const Packet8f &a, const Packet8f &b)
Definition: AVX/PacketMath.h:1327
EIGEN_STRONG_INLINE float pfirst< Packet8f >(const Packet8f &a)
Definition: AVX/PacketMath.h:1764
EIGEN_STRONG_INLINE Packet8h pceil< Packet8h >(const Packet8h &a)
Definition: AVX/PacketMath.h:2343
EIGEN_STRONG_INLINE float predux_max< Packet8f >(const Packet8f &a)
Definition: AVX/PacketMath.h:2010
EIGEN_STRONG_INLINE Packet8f pand< Packet8f >(const Packet8f &a, const Packet8f &b)
Definition: AVX/PacketMath.h:1302
EIGEN_STRONG_INLINE Packet8bf pfloor< Packet8bf >(const Packet8bf &a)
Definition: AltiVec/PacketMath.h:2356
const char * SsePrefetchPtrType
Definition: SSE/PacketMath.h:1719
EIGEN_STRONG_INLINE void pstore< float >(float *to, const Packet4f &from)
Definition: AltiVec/PacketMath.h:642
EIGEN_STRONG_INLINE Packet4d pselect< Packet4d >(const Packet4d &mask, const Packet4d &a, const Packet4d &b)
Definition: AVX/PacketMath.h:1439
EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f &a)
Definition: AltiVec/PacketMath.h:1936
EIGEN_STRONG_INLINE Packet8ui pand< Packet8ui >(const Packet8ui &a, const Packet8ui &b)
Definition: AVX/PacketMath.h:1318
EIGEN_STRONG_INLINE Packet8f peven_mask(const Packet8f &)
Definition: AVX/PacketMath.h:791
EIGEN_STRONG_INLINE Packet8i pandnot< Packet8i >(const Packet8i &a, const Packet8i &b)
Definition: AVX/PacketMath.h:1385
EIGEN_STRONG_INLINE bfloat16 pfirst(const Packet8bf &a)
Definition: AltiVec/PacketMath.h:2418
EIGEN_STRONG_INLINE Packet8ui pmin< Packet8ui >(const Packet8ui &a, const Packet8ui &b)
Definition: AVX/PacketMath.h:1136
EIGEN_STRONG_INLINE int predux< Packet8i >(const Packet8i &a)
Definition: AVX/PacketMath.h:1962
eigen_packet_wrapper< __m256i, 0 > Packet8i
Definition: AVX/PacketMath.h:35
EIGEN_STRONG_INLINE Packet4d pround< Packet4d >(const Packet4d &a)
Definition: AVX/PacketMath.h:1417
EIGEN_STRONG_INLINE Packet8f ptrunc< Packet8f >(const Packet8f &a)
Definition: AVX/PacketMath.h:1260
EIGEN_DEVICE_FUNC Packet4d pgather< double, Packet4d >(const double *from, Index stride)
Definition: AVX/PacketMath.h:1673
EIGEN_DEVICE_FUNC void pstore(Scalar *to, const Packet &from)
Definition: GenericPacketMath.h:891
EIGEN_STRONG_INLINE Eigen::half predux_max< Packet8h >(const Packet8h &a)
Definition: AVX/PacketMath.h:2459
EIGEN_STRONG_INLINE uint32_t predux< Packet8ui >(const Packet8ui &a)
Definition: AVX/PacketMath.h:1966
EIGEN_STRONG_INLINE Packet4f pnmsub(const Packet4f &a, const Packet4f &b, const Packet4f &c)
Definition: LSX/PacketMath.h:835
EIGEN_STRONG_INLINE Packet4d plset< Packet4d >(const double &a)
Definition: AVX/PacketMath.h:857
EIGEN_DEVICE_FUNC unpacket_traits< Packet >::type predux(const Packet &a)
Definition: GenericPacketMath.h:1232
EIGEN_DEVICE_FUNC void pscatter< double, Packet4d >(double *to, const Packet4d &from, Index stride)
Definition: AVX/PacketMath.h:1701
EIGEN_STRONG_INLINE Packet8bf psub< Packet8bf >(const Packet8bf &a, const Packet8bf &b)
Definition: AltiVec/PacketMath.h:2304
EIGEN_STRONG_INLINE bfloat16 predux< Packet8bf >(const Packet8bf &a)
Definition: AltiVec/PacketMath.h:2455
EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf &a, const Packet2cf &b)
Definition: AltiVec/Complex.h:353
EIGEN_STRONG_INLINE Packet4d pmin< Packet4d >(const Packet4d &a, const Packet4d &b)
Definition: AVX/PacketMath.h:1114
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pldexp_generic(const Packet &a, const Packet &exponent)
Definition: GenericPacketMathFunctions.h:226
EIGEN_STRONG_INLINE Packet8bf pload< Packet8bf >(const bfloat16 *from)
Definition: AltiVec/PacketMath.h:522
EIGEN_STRONG_INLINE Packet4f pmsub(const Packet4f &a, const Packet4f &b, const Packet4f &c)
Definition: LSX/PacketMath.h:819
EIGEN_DEVICE_FUNC void pscatter< int, Packet8i >(int *to, const Packet8i &from, Index stride)
Definition: AVX/PacketMath.h:1710
EIGEN_STRONG_INLINE void pstoreu< int >(int *to, const Packet4i &from)
Definition: AltiVec/PacketMath.h:1760
EIGEN_STRONG_INLINE Packet8h pand(const Packet8h &a, const Packet8h &b)
Definition: AVX/PacketMath.h:2319
EIGEN_DEVICE_FUNC void pscatter< uint32_t, Packet8ui >(uint32_t *to, const Packet8ui &from, Index stride)
Definition: AVX/PacketMath.h:1724
EIGEN_STRONG_INLINE Packet8h pxor(const Packet8h &a, const Packet8h &b)
Definition: AVX/PacketMath.h:2315
EIGEN_STRONG_INLINE Packet4d pand< Packet4d >(const Packet4d &a, const Packet4d &b)
Definition: AVX/PacketMath.h:1306
eigen_packet_wrapper< __m256i, 4 > Packet8ui
Definition: AVX/PacketMath.h:41
EIGEN_STRONG_INLINE Packet8f pround< Packet8f >(const Packet8f &a)
Definition: AVX/PacketMath.h:1411
EIGEN_STRONG_INLINE Packet4f pnmadd(const Packet4f &a, const Packet4f &b, const Packet4f &c)
Definition: LSX/PacketMath.h:827
EIGEN_STRONG_INLINE Packet8f pmul< Packet8f >(const Packet8f &a, const Packet8f &b)
Definition: AVX/PacketMath.h:927
EIGEN_STRONG_INLINE Packet8f pldexp< Packet8f >(const Packet8f &a, const Packet8f &exponent)
Definition: AVX/PacketMath.h:1906
EIGEN_STRONG_INLINE Packet8ui por< Packet8ui >(const Packet8ui &a, const Packet8ui &b)
Definition: AVX/PacketMath.h:1343
EIGEN_STRONG_INLINE Packet8ui pmul< Packet8ui >(const Packet8ui &a, const Packet8ui &b)
Definition: AVX/PacketMath.h:945
EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f &mask, const Packet4f &a, const Packet4f &b)
Definition: AltiVec/PacketMath.h:1474
EIGEN_STRONG_INLINE Packet8i ploaddup< Packet8i >(const int *from)
Definition: AVX/PacketMath.h:1562
EIGEN_STRONG_INLINE Packet8h pmin< Packet8h >(const Packet8h &a, const Packet8h &b)
Definition: AVX/PacketMath.h:2294
EIGEN_STRONG_INLINE Packet8ui ploadquad< Packet8ui >(const uint32_t *from)
Definition: AVX/PacketMath.h:1602
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pfrexp_generic(const Packet &a, Packet &exponent)
Definition: GenericPacketMathFunctions.h:184
EIGEN_STRONG_INLINE Packet4d pmax< PropagateNumbers, Packet4d >(const Packet4d &a, const Packet4d &b)
Definition: AVX/PacketMath.h:1212
EIGEN_DEVICE_FUNC Packet psub(const Packet &a, const Packet &b)
Definition: GenericPacketMath.h:337
EIGEN_STRONG_INLINE bfloat16 predux_min< Packet8bf >(const Packet8bf &a)
Definition: AltiVec/PacketMath.h:2609
EIGEN_STRONG_INLINE Packet4d pload< Packet4d >(const double *from)
Definition: AVX/PacketMath.h:1494
EIGEN_STRONG_INLINE Packet8f pload1< Packet8f >(const float *from)
Definition: AVX/PacketMath.h:808
EIGEN_STRONG_INLINE Packet8ui pxor< Packet8ui >(const Packet8ui &a, const Packet8ui &b)
Definition: AVX/PacketMath.h:1368
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter< bfloat16, Packet8bf >(bfloat16 *to, const Packet8bf &from, Index stride)
Definition: AltiVec/PacketMath.h:977
EIGEN_STRONG_INLINE Packet4d pxor< Packet4d >(const Packet4d &a, const Packet4d &b)
Definition: AVX/PacketMath.h:1356
EIGEN_DEVICE_FUNC Packet8f pgather< float, Packet8f >(const float *from, Index stride)
Definition: AVX/PacketMath.h:1668
EIGEN_STRONG_INLINE Packet8i ptrue< Packet8i >(const Packet8i &a)
Definition: AVX/PacketMath.h:1269
EIGEN_STRONG_INLINE Packet8h ploaddup< Packet8h >(const Eigen::half *from)
Definition: AVX/PacketMath.h:2247
EIGEN_STRONG_INLINE Packet8h ploadquad< Packet8h >(const Eigen::half *from)
Definition: AVX/PacketMath.h:2256
EIGEN_STRONG_INLINE Packet8i pselect< Packet8i >(const Packet8i &mask, const Packet8i &a, const Packet8i &b)
Definition: AVX/PacketMath.h:1428
EIGEN_STRONG_INLINE Packet8h pround< Packet8h >(const Packet8h &a)
Definition: AVX/PacketMath.h:2333
EIGEN_STRONG_INLINE Packet8bf ploadquad< Packet8bf >(const bfloat16 *from)
Definition: AltiVec/PacketMath.h:1689
EIGEN_STRONG_INLINE Packet8ui pselect< Packet8ui >(const Packet8ui &mask, const Packet8ui &a, const Packet8ui &b)
Definition: AVX/PacketMath.h:1433
EIGEN_STRONG_INLINE Packet8h ploadu< Packet8h >(const Eigen::half *from)
Definition: AVX/PacketMath.h:2232
EIGEN_STRONG_INLINE double pfirst< Packet4d >(const Packet4d &a)
Definition: AVX/PacketMath.h:1768
EIGEN_STRONG_INLINE Packet8f pmin< PropagateNaN, Packet8f >(const Packet8f &a, const Packet8f &b)
Definition: AVX/PacketMath.h:1216
EIGEN_STRONG_INLINE Packet4d padd< Packet4d >(const Packet4d &a, const Packet4d &b)
Definition: AVX/PacketMath.h:828
EIGEN_STRONG_INLINE Packet8ui plset< Packet8ui >(const uint32_t &a)
Definition: AVX/PacketMath.h:865
EIGEN_STRONG_INLINE Packet8f pdiv< Packet8f >(const Packet8f &a, const Packet8f &b)
Definition: AVX/PacketMath.h:956
EIGEN_STRONG_INLINE Packet8i pmin< Packet8i >(const Packet8i &a, const Packet8i &b)
Definition: AVX/PacketMath.h:1126
__vector float Packet4f
Definition: AltiVec/PacketMath.h:33
EIGEN_STRONG_INLINE Packet8h padd< Packet8h >(const Packet8h &a, const Packet8h &b)
Definition: AVX/PacketMath.h:2390
__m256 Packet8f
Definition: AVX/PacketMath.h:34
EIGEN_STRONG_INLINE Packet4d pset1frombits< Packet4d >(uint64_t from)
Definition: AVX/PacketMath.h:769
EIGEN_STRONG_INLINE double predux_mul< Packet4d >(const Packet4d &a)
Definition: AVX/PacketMath.h:1991
EIGEN_STRONG_INLINE bfloat16 pfirst< Packet8bf >(const Packet8bf &from)
Definition: AVX/PacketMath.h:2624
EIGEN_STRONG_INLINE Eigen::half pfirst< Packet8h >(const Packet8h &from)
Definition: AVX/PacketMath.h:2222
EIGEN_STRONG_INLINE Packet8bf F32ToBf16(Packet4f p4f)
Definition: AltiVec/PacketMath.h:2059
EIGEN_STRONG_INLINE void pstoreu< int64_t >(int64_t *to, const Packet8l &from)
Definition: AVX512/PacketMath.h:1123
EIGEN_STRONG_INLINE void pstoreu< float >(float *to, const Packet4f &from)
Definition: AltiVec/PacketMath.h:1756
EIGEN_STRONG_INLINE Packet8f pset1< Packet8f >(const float &from)
Definition: AVX/PacketMath.h:748
EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f &a, const Packet4f &b)
Definition: AltiVec/PacketMath.h:1329
EIGEN_STRONG_INLINE Packet8i pand< Packet8i >(const Packet8i &a, const Packet8i &b)
Definition: AVX/PacketMath.h:1310
EIGEN_STRONG_INLINE Packet4d pfloor< Packet4d >(const Packet4d &a)
Definition: AVX/PacketMath.h:1255
EIGEN_STRONG_INLINE Packet8bf ploadu< Packet8bf >(const bfloat16 *from)
Definition: AltiVec/PacketMath.h:1549
EIGEN_STRONG_INLINE Packet4d por< Packet4d >(const Packet4d &a, const Packet4d &b)
Definition: AVX/PacketMath.h:1331
EIGEN_STRONG_INLINE void pstore< uint64_t >(uint64_t *to, const Packet2ul &from)
Definition: LSX/PacketMath.h:1569
EIGEN_STRONG_INLINE Packet8bf pmax< Packet8bf >(const Packet8bf &a, const Packet8bf &b)
Definition: AltiVec/PacketMath.h:2396
EIGEN_STRONG_INLINE Packet4ui ploadu< Packet4ui >(const uint32_t *from)
Definition: LSX/PacketMath.h:1476
EIGEN_STRONG_INLINE void pstore< uint32_t >(uint32_t *to, const Packet8ui &from)
Definition: AVX/PacketMath.h:1619
EIGEN_DEVICE_FUNC Packet psign(const Packet &a)
Definition: GenericPacketMath.h:1189
EIGEN_STRONG_INLINE Packet8bf pset1< Packet8bf >(const bfloat16 &from)
Definition: AltiVec/PacketMath.h:808
EIGEN_STRONG_INLINE void prefetch< int >(const int *addr)
Definition: AltiVec/PacketMath.h:1858
EIGEN_STRONG_INLINE Packet4d pload1< Packet4d >(const double *from)
Definition: AVX/PacketMath.h:812
__m256d Packet4d
Definition: AVX/PacketMath.h:36
EIGEN_STRONG_INLINE Packet4d pandnot< Packet4d >(const Packet4d &a, const Packet4d &b)
Definition: AVX/PacketMath.h:1381
EIGEN_STRONG_INLINE Packet4d psub< Packet4d >(const Packet4d &a, const Packet4d &b)
Definition: AVX/PacketMath.h:874
EIGEN_STRONG_INLINE Packet8i pmax< Packet8i >(const Packet8i &a, const Packet8i &b)
Definition: AVX/PacketMath.h:1171
EIGEN_STRONG_INLINE Packet8f ptrue< Packet8f >(const Packet8f &a)
Definition: AVX/PacketMath.h:1280
EIGEN_STRONG_INLINE Packet8i pmul< Packet8i >(const Packet8i &a, const Packet8i &b)
Definition: AVX/PacketMath.h:935
EIGEN_STRONG_INLINE bfloat16 predux_max< Packet8bf >(const Packet8bf &a)
Definition: AltiVec/PacketMath.h:2689
EIGEN_STRONG_INLINE void pstoreu< uint64_t >(uint64_t *to, const Packet2ul &from)
Definition: LSX/PacketMath.h:1611
eigen_packet_wrapper< __m128i, 2 > Packet8h
Definition: AVX/PacketMath.h:38
EIGEN_STRONG_INLINE Packet8h pload< Packet8h >(const Eigen::half *from)
Definition: AVX/PacketMath.h:2227
EIGEN_DEVICE_FUNC void pscatter< float, Packet8f >(float *to, const Packet8f &from, Index stride)
Definition: AVX/PacketMath.h:1687
EIGEN_STRONG_INLINE Packet4d pmax< PropagateNaN, Packet4d >(const Packet4d &a, const Packet4d &b)
Definition: AVX/PacketMath.h:1228
EIGEN_STRONG_INLINE void pstore< int64_t >(int64_t *to, const Packet8l &from)
Definition: AVX512/PacketMath.h:1106
EIGEN_STRONG_INLINE Packet8f print< Packet8f >(const Packet8f &a)
Definition: AVX/PacketMath.h:1233
EIGEN_STRONG_INLINE void prefetch< double >(const double *addr)
Definition: AVX/PacketMath.h:1750
EIGEN_STRONG_INLINE Packet8ui pmax< Packet8ui >(const Packet8ui &a, const Packet8ui &b)
Definition: AVX/PacketMath.h:1181
EIGEN_STRONG_INLINE Eigen::half predux_mul< Packet8h >(const Packet8h &a)
Definition: AVX/PacketMath.h:2473
EIGEN_STRONG_INLINE Packet8ui padd< Packet8ui >(const Packet8ui &a, const Packet8ui &b)
Definition: AVX/PacketMath.h:842
std::uint8_t uint8_t
Definition: Meta.h:36
std::int64_t int64_t
Definition: Meta.h:43
std::uint16_t uint16_t
Definition: Meta.h:38
std::uint32_t uint32_t
Definition: Meta.h:40
std::uint64_t uint64_t
Definition: Meta.h:42
Namespace containing all symbols from the Eigen library.
Definition: bench_norm.cpp:70
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition: Meta.h:83
double S0
Strength of source function in inner region.
Definition: stefan_boltzmann.cc:148
double S1
Strength of source function in outer region.
Definition: stefan_boltzmann.cc:151
r
Definition: UniformPSDSelfTest.py:20
int c
Definition: calibrate.py:100
Definition: Eigen_Colamd.h:49
list x
Definition: plotDoE.py:28
t
Definition: plotPSD.py:36
Definition: BFloat16.h:101
Definition: Half.h:139
Definition: GenericPacketMath.h:1407
Packet packet[N]
Definition: GenericPacketMath.h:1408
Definition: GenericPacketMath.h:1421
bool select[N]
Definition: GenericPacketMath.h:1422
Definition: GenericPacketMath.h:45
@ HasASin
Definition: GenericPacketMath.h:84
@ HasATanh
Definition: GenericPacketMath.h:87
@ HasRsqrt
Definition: GenericPacketMath.h:74
@ HasSin
Definition: GenericPacketMath.h:81
@ HasBlend
Definition: GenericPacketMath.h:66
@ HasErfc
Definition: GenericPacketMath.h:96
@ HasACos
Definition: GenericPacketMath.h:85
@ HasNdtri
Definition: GenericPacketMath.h:97
@ HasCos
Definition: GenericPacketMath.h:82
@ HasCmp
Definition: GenericPacketMath.h:69
@ HasReciprocal
Definition: GenericPacketMath.h:72
@ HasShift
Definition: GenericPacketMath.h:50
@ HasLog1p
Definition: GenericPacketMath.h:78
@ HasExp
Definition: GenericPacketMath.h:75
@ HasSqrt
Definition: GenericPacketMath.h:73
@ HasErf
Definition: GenericPacketMath.h:95
@ HasBessel
Definition: GenericPacketMath.h:98
@ HasExpm1
Definition: GenericPacketMath.h:76
@ HasLog
Definition: GenericPacketMath.h:77
@ HasTanh
Definition: GenericPacketMath.h:90
@ HasATan
Definition: GenericPacketMath.h:86
@ HasDiv
Definition: GenericPacketMath.h:71
Definition: GenericPacketMath.h:225
Definition: Meta.h:145
@ value
Definition: Meta.h:146
Packet8h type
Definition: AVX/PacketMath.h:161
Packet8h half
Definition: AVX/PacketMath.h:163
Packet8bf half
Definition: AVX/PacketMath.h:202
Packet8bf type
Definition: AVX/PacketMath.h:199
Packet2d half
Definition: AVX/PacketMath.h:134
Packet4d type
Definition: AVX/PacketMath.h:133
Packet8f type
Definition: AVX/PacketMath.h:101
Packet4f half
Definition: AVX/PacketMath.h:102
Packet8i type
Definition: AVX/PacketMath.h:238
Packet4i half
Definition: AVX/PacketMath.h:239
Packet8ui type
Definition: AVX/PacketMath.h:244
Packet4ui half
Definition: AVX/PacketMath.h:245
Definition: GenericPacketMath.h:108
T type
Definition: GenericPacketMath.h:109
@ size
Definition: GenericPacketMath.h:113
@ AlignedOnScalar
Definition: GenericPacketMath.h:114
@ Vectorizable
Definition: GenericPacketMath.h:112
T half
Definition: GenericPacketMath.h:110
@ HasSub
Definition: GenericPacketMath.h:118
@ HasMax
Definition: GenericPacketMath.h:124
@ HasNegate
Definition: GenericPacketMath.h:120
@ HasMul
Definition: GenericPacketMath.h:119
@ HasAdd
Definition: GenericPacketMath.h:117
@ HasSetLinear
Definition: GenericPacketMath.h:126
@ HasMin
Definition: GenericPacketMath.h:123
@ HasConj
Definition: GenericPacketMath.h:125
@ HasAbs2
Definition: GenericPacketMath.h:122
@ HasAbs
Definition: GenericPacketMath.h:121
Definition: XprHelper.h:883
@ value
Definition: XprHelper.h:884
@ mask
Definition: SSE/PacketMath.h:91
Packet2d half
Definition: AVX/PacketMath.h:325
double type
Definition: AVX/PacketMath.h:324
Packet8bf half
Definition: AVX/PacketMath.h:390
bfloat16 type
Definition: AVX/PacketMath.h:389
uint8_t mask_t
Definition: AVX/PacketMath.h:309
Packet8i integer_packet
Definition: AVX/PacketMath.h:308
Packet4f half
Definition: AVX/PacketMath.h:307
float type
Definition: AVX/PacketMath.h:306
Packet8h half
Definition: AVX/PacketMath.h:2212
Eigen::half type
Definition: AVX/PacketMath.h:2204
Packet4i half
Definition: AVX/PacketMath.h:340
int type
Definition: AVX/PacketMath.h:339
Packet4ui half
Definition: AVX/PacketMath.h:352
uint32_t type
Definition: AVX/PacketMath.h:351
Definition: GenericPacketMath.h:134
numext::get_integer_by_size< sizeof(T)>::signed_type integer_packet
Definition: GenericPacketMath.h:137
T type
Definition: GenericPacketMath.h:135
T half
Definition: GenericPacketMath.h:136
@ masked_load_available
Definition: GenericPacketMath.h:142
@ size
Definition: GenericPacketMath.h:139
@ masked_store_available
Definition: GenericPacketMath.h:143
@ vectorizable
Definition: GenericPacketMath.h:141
@ alignment
Definition: GenericPacketMath.h:140
std::ofstream out("Result.txt")
std::ptrdiff_t j
Definition: tut_arithmetic_redux_minmax.cpp:2