PacketMathFP16.h
Go to the documentation of this file.
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 //
5 //
6 // This Source Code Form is subject to the terms of the Mozilla
7 // Public License v. 2.0. If a copy of the MPL was not distributed
8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 
10 #ifndef EIGEN_PACKET_MATH_FP16_AVX512_H
11 #define EIGEN_PACKET_MATH_FP16_AVX512_H
12 
13 // IWYU pragma: private
14 #include "../../InternalHeaderCheck.h"
15 
16 namespace Eigen {
17 
18 namespace internal {
19 
20 typedef __m512h Packet32h;
23 
// Trait specialization that reports the packet type Packet8h as arithmetic
// by exposing `value = true`.
24 template <>
25 struct is_arithmetic<Packet8h> {
26  enum { value = true };
27 };
28 
// Packet traits for the scalar type `half`.
// The full-width packet is Packet32h (`size = 32` elements) and the
// half-width packet is Packet16h. The enum entries are capability flags:
// 1 marks an operation as available for this packet type, 0 as unavailable.
29 template <>
30 struct packet_traits<half> : default_packet_traits {
31  typedef Packet32h type;
   // Note: this member typedef is itself named `half` (the half-width packet).
32  typedef Packet16h half;
33  enum {
34  Vectorizable = 1,
35  AlignedOnScalar = 1,
36  size = 32,
37 
   // Comparison and basic arithmetic capabilities.
38  HasCmp = 1,
39  HasAdd = 1,
40  HasSub = 1,
41  HasMul = 1,
42  HasDiv = 1,
43  HasNegate = 1,
44  HasAbs = 1,
45  HasAbs2 = 0,
46  HasMin = 1,
47  HasMax = 1,
48  HasConj = 1,
49  HasSetLinear = 0,
   // Math-function capabilities.
50  HasLog = 1,
51  HasLog1p = 1,
52  HasExp = 1,
53  HasExpm1 = 1,
54  HasSqrt = 1,
55  HasRsqrt = 1,
56  // These should be implemented in the future
57  HasBessel = 0,
58  HasNdtri = 0,
62  HasErf = 0, // EIGEN_FAST_MATH,
63  HasBlend = 0
64  };
65 };
66 
67 template <>
69  typedef Eigen::half type;
70  typedef Packet16h half;
71  enum {
72  size = 32,
74  vectorizable = true,
77  };
78 };
79 
// Reverse traits for Packet16h: maps the packet back to its scalar type
// (Eigen::half) and its half-width packet (Packet8h), and records that the
// packet holds 16 elements, is vectorizable, and has no masked load.
80 template <>
81 struct unpacket_traits<Packet16h> {
82  typedef Eigen::half type;
83  typedef Packet8h half;
84  enum {
85  size = 16,
87  vectorizable = true,
88  masked_load_available = false,
90  };
91 };
92 
// Reverse traits for Packet8h: scalar type is Eigen::half and the packet holds
// 8 elements. The `half` typedef names Packet8h itself, i.e. no narrower packet
// is declared here. Neither masked loads nor masked stores are available.
93 template <>
94 struct unpacket_traits<Packet8h> {
95  typedef Eigen::half type;
96  typedef Packet8h half;
97  enum {
98  size = 8,
100  vectorizable = true,
101  masked_load_available = false,
102  masked_store_available = false
103  };
104 };
105 
106 // Memory functions
107 
108 // pset1
109 
110 template <>
112  // half/half_raw is bit compatible
113  return _mm512_set1_ph(numext::bit_cast<_Float16>(from));
114 }
115 
116 template <>
118  return _mm512_setzero_ph();
119 }
120 
121 // pset1frombits
122 template <>
124  return _mm512_castsi512_ph(_mm512_set1_epi16(from));
125 }
126 
127 // pfirst
128 
129 template <>
131 #ifdef EIGEN_VECTORIZE_AVX512DQ
133  static_cast<unsigned short>(_mm256_extract_epi16(_mm512_extracti32x8_epi32(_mm512_castph_si512(from), 0), 0)));
134 #else
135  Eigen::half dest[32];
136  _mm512_storeu_ph(dest, from);
137  return dest[0];
138 #endif
139 }
140 
141 // pload
142 
143 template <>
145  EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_ph(from);
146 }
147 
148 // ploadu
149 
150 template <>
152  EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_ph(from);
153 }
154 
155 // pstore
156 
157 template <>
159  EIGEN_DEBUG_ALIGNED_STORE _mm512_store_ph(to, from);
160 }
161 
162 // pstoreu
163 
164 template <>
166  EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_ph(to, from);
167 }
168 
169 // ploaddup
170 template <>
172  __m512h a = _mm512_castph256_ph512(_mm256_loadu_ph(from));
173  return _mm512_permutexvar_ph(_mm512_set_epi16(15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8, 7, 7, 6, 6,
174  5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0),
175  a);
176 }
177 
178 // ploadquad
179 template <>
181  __m512h a = _mm512_castph128_ph512(_mm_loadu_ph(from));
182  return _mm512_permutexvar_ph(
183  _mm512_set_epi16(7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0),
184  a);
185 }
186 
187 // pabs
188 
189 template <>
191  return _mm512_abs_ph(a);
192 }
193 
194 // psignbit
195 
196 template <>
198  return _mm512_castsi512_ph(_mm512_srai_epi16(_mm512_castph_si512(a), 15));
199 }
200 
201 // pmin
202 
203 template <>
205  return _mm512_min_ph(a, b);
206 }
207 
208 // pmax
209 
210 template <>
212  return _mm512_max_ph(a, b);
213 }
214 
215 // plset
216 template <>
218  return _mm512_add_ph(pset1<Packet32h>(a), _mm512_set_ph(31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
219  16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
220 }
221 
222 // por
223 
224 template <>
226  return _mm512_castsi512_ph(_mm512_or_si512(_mm512_castph_si512(a), _mm512_castph_si512(b)));
227 }
228 
229 // pxor
230 
231 template <>
233  return _mm512_castsi512_ph(_mm512_xor_si512(_mm512_castph_si512(a), _mm512_castph_si512(b)));
234 }
235 
236 // pand
237 
238 template <>
240  return _mm512_castsi512_ph(_mm512_and_si512(_mm512_castph_si512(a), _mm512_castph_si512(b)));
241 }
242 
243 // pandnot
244 
245 template <>
247  return _mm512_castsi512_ph(_mm512_andnot_si512(_mm512_castph_si512(b), _mm512_castph_si512(a)));
248 }
249 
250 // pselect
251 
// Lane-wise select between `a` and `b`, driven by the bit pattern of `mask`.
// Each 16-bit lane of `mask` is compared for equality with zero, so `mask32`
// has a bit set for every lane whose mask bits are all zero. The blend then
// returns `b` in those lanes and `a` in all other lanes (a set bit in the
// blend mask selects the second source operand).
252 template <>
253 EIGEN_DEVICE_FUNC inline Packet32h pselect(const Packet32h& mask, const Packet32h& a, const Packet32h& b) {
    // One mask bit per 16-bit lane: set where the lane of `mask` is exactly zero.
254  __mmask32 mask32 = _mm512_cmp_epi16_mask(_mm512_castph_si512(mask), _mm512_setzero_epi32(), _MM_CMPINT_EQ);
255  return _mm512_mask_blend_ph(mask32, a, b);
256 }
257 
258 // pcmp_eq
259 
260 template <>
262  __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_EQ_OQ);
263  return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, static_cast<short>(0xffffu)));
264 }
265 
266 // pcmp_le
267 
268 template <>
270  __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_LE_OQ);
271  return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, static_cast<short>(0xffffu)));
272 }
273 
274 // pcmp_lt
275 
276 template <>
278  __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_LT_OQ);
279  return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, static_cast<short>(0xffffu)));
280 }
281 
282 // pcmp_lt_or_nan
283 
284 template <>
286  __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_NGE_UQ);
287  return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi16(0), mask, static_cast<short>(0xffffu)));
288 }
289 
290 // padd
291 
292 template <>
294  return _mm512_add_ph(a, b);
295 }
296 
297 template <>
299  return _mm256_castph_si256(_mm256_add_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));
300 }
301 
302 template <>
304  return _mm_castph_si128(_mm_add_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b)));
305 }
306 
307 // psub
308 
309 template <>
311  return _mm512_sub_ph(a, b);
312 }
313 
314 template <>
316  return _mm256_castph_si256(_mm256_sub_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));
317 }
318 
319 template <>
321  return _mm_castph_si128(_mm_sub_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b)));
322 }
323 
324 // pmul
325 
326 template <>
328  return _mm512_mul_ph(a, b);
329 }
330 
331 template <>
333  return _mm256_castph_si256(_mm256_mul_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));
334 }
335 
336 template <>
338  return _mm_castph_si128(_mm_mul_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b)));
339 }
340 
341 // pdiv
342 
343 template <>
345  return _mm512_div_ph(a, b);
346 }
347 
348 template <>
350  return _mm256_castph_si256(_mm256_div_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b)));
351 }
352 
353 template <>
355  return _mm_castph_si128(_mm_div_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b)));
356 }
357 
358 // pround
359 
360 template <>
362  // Work-around for default std::round rounding mode.
363 
364  // Mask for the sign bit
365  const Packet32h signMask = pset1frombits<Packet32h>(static_cast<numext::uint16_t>(0x8000u));
366  // The largest half-precision float less than 0.5
367  const Packet32h prev0dot5 = pset1frombits<Packet32h>(static_cast<numext::uint16_t>(0x37FFu));
368 
369  return _mm512_roundscale_ph(padd(por(pand(a, signMask), prev0dot5), a), _MM_FROUND_TO_ZERO);
370 }
371 
372 // print
373 
374 template <>
376  return _mm512_roundscale_ph(a, _MM_FROUND_CUR_DIRECTION);
377 }
378 
379 // pceil
380 
381 template <>
383  return _mm512_roundscale_ph(a, _MM_FROUND_TO_POS_INF);
384 }
385 
386 // pfloor
387 
388 template <>
390  return _mm512_roundscale_ph(a, _MM_FROUND_TO_NEG_INF);
391 }
392 
393 // ptrunc
394 
395 template <>
397  return _mm512_roundscale_ph(a, _MM_FROUND_TO_ZERO);
398 }
399 
400 // predux
401 template <>
403  return (half)_mm512_reduce_add_ph(a);
404 }
405 
406 template <>
408  return (half)_mm256_reduce_add_ph(_mm256_castsi256_ph(a));
409 }
410 
411 template <>
413  return (half)_mm_reduce_add_ph(_mm_castsi128_ph(a));
414 }
415 
416 // predux_half_dowto4
417 template <>
419 #ifdef EIGEN_VECTORIZE_AVX512DQ
420  __m256i lowHalf = _mm256_castps_si256(_mm512_extractf32x8_ps(_mm512_castph_ps(a), 0));
421  __m256i highHalf = _mm256_castps_si256(_mm512_extractf32x8_ps(_mm512_castph_ps(a), 1));
422 
423  return Packet16h(padd<Packet16h>(lowHalf, highHalf));
424 #else
425  Eigen::half data[32];
426  _mm512_storeu_ph(data, a);
427 
428  __m256i lowHalf = _mm256_castph_si256(_mm256_loadu_ph(data));
429  __m256i highHalf = _mm256_castph_si256(_mm256_loadu_ph(data + 16));
430 
431  return Packet16h(padd<Packet16h>(lowHalf, highHalf));
432 #endif
433 }
434 
435 // predux_max
436 
437 // predux_min
438 
439 // predux_mul
440 
441 #ifdef EIGEN_VECTORIZE_FMA
442 
443 // pmadd
444 
445 template <>
447  return _mm512_fmadd_ph(a, b, c);
448 }
449 
450 template <>
452  return _mm256_castph_si256(_mm256_fmadd_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));
453 }
454 
// Fused multiply-add for Packet8h: a * b + c.
// The packets are reinterpreted as 128-bit FP16 vectors for _mm_fmadd_ph and
// the result is reinterpreted back to the integer-vector representation.
// Only compiled when EIGEN_VECTORIZE_FMA is defined (enclosing #ifdef).
455 template <>
456 EIGEN_STRONG_INLINE Packet8h pmadd(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
457  return _mm_castph_si128(_mm_fmadd_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));
458 }
459 
460 // pmsub
461 
462 template <>
464  return _mm512_fmsub_ph(a, b, c);
465 }
466 
467 template <>
469  return _mm256_castph_si256(_mm256_fmsub_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));
470 }
471 
// Fused multiply-subtract for Packet8h: a * b - c.
// The packets are reinterpreted as 128-bit FP16 vectors for _mm_fmsub_ph and
// the result is reinterpreted back to the integer-vector representation.
// Only compiled when EIGEN_VECTORIZE_FMA is defined (enclosing #ifdef).
472 template <>
473 EIGEN_STRONG_INLINE Packet8h pmsub(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
474  return _mm_castph_si128(_mm_fmsub_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));
475 }
476 
477 // pnmadd
478 
479 template <>
481  return _mm512_fnmadd_ph(a, b, c);
482 }
483 
484 template <>
486  return _mm256_castph_si256(_mm256_fnmadd_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));
487 }
488 
// Fused negated multiply-add for Packet8h: -(a * b) + c.
// The packets are reinterpreted as 128-bit FP16 vectors for _mm_fnmadd_ph and
// the result is reinterpreted back to the integer-vector representation.
// Only compiled when EIGEN_VECTORIZE_FMA is defined (enclosing #ifdef).
489 template <>
490 EIGEN_STRONG_INLINE Packet8h pnmadd(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
491  return _mm_castph_si128(_mm_fnmadd_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));
492 }
493 
494 // pnmsub
495 
496 template <>
498  return _mm512_fnmsub_ph(a, b, c);
499 }
500 
501 template <>
503  return _mm256_castph_si256(_mm256_fnmsub_ph(_mm256_castsi256_ph(a), _mm256_castsi256_ph(b), _mm256_castsi256_ph(c)));
504 }
505 
// Fused negated multiply-subtract for Packet8h: -(a * b) - c.
// The packets are reinterpreted as 128-bit FP16 vectors for _mm_fnmsub_ph and
// the result is reinterpreted back to the integer-vector representation.
// Only compiled when EIGEN_VECTORIZE_FMA is defined (enclosing #ifdef).
506 template <>
507 EIGEN_STRONG_INLINE Packet8h pnmsub(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
508  return _mm_castph_si128(_mm_fnmsub_ph(_mm_castsi128_ph(a), _mm_castsi128_ph(b), _mm_castsi128_ph(c)));
509 }
510 
511 #endif
512 
513 // pnegate
514 
515 template <>
517  return psub(pzero(a), a);
518 }
519 
520 // pconj
521 
522 template <>
524  return a;
525 }
526 
527 // psqrt
528 
529 template <>
531  return _mm512_sqrt_ph(a);
532 }
533 
534 // prsqrt
535 
536 template <>
538  return _mm512_rsqrt_ph(a);
539 }
540 
541 // preciprocal
542 
543 template <>
545  return _mm512_rcp_ph(a);
546 }
547 
548 // ptranspose
549 
551  __m512i t[32];
552 
554  for (int i = 0; i < 16; i++) {
555  t[2 * i] = _mm512_unpacklo_epi16(_mm512_castph_si512(a.packet[2 * i]), _mm512_castph_si512(a.packet[2 * i + 1]));
556  t[2 * i + 1] =
557  _mm512_unpackhi_epi16(_mm512_castph_si512(a.packet[2 * i]), _mm512_castph_si512(a.packet[2 * i + 1]));
558  }
559 
560  __m512i p[32];
561 
563  for (int i = 0; i < 8; i++) {
564  p[4 * i] = _mm512_unpacklo_epi32(t[4 * i], t[4 * i + 2]);
565  p[4 * i + 1] = _mm512_unpackhi_epi32(t[4 * i], t[4 * i + 2]);
566  p[4 * i + 2] = _mm512_unpacklo_epi32(t[4 * i + 1], t[4 * i + 3]);
567  p[4 * i + 3] = _mm512_unpackhi_epi32(t[4 * i + 1], t[4 * i + 3]);
568  }
569 
570  __m512i q[32];
571 
573  for (int i = 0; i < 4; i++) {
574  q[8 * i] = _mm512_unpacklo_epi64(p[8 * i], p[8 * i + 4]);
575  q[8 * i + 1] = _mm512_unpackhi_epi64(p[8 * i], p[8 * i + 4]);
576  q[8 * i + 2] = _mm512_unpacklo_epi64(p[8 * i + 1], p[8 * i + 5]);
577  q[8 * i + 3] = _mm512_unpackhi_epi64(p[8 * i + 1], p[8 * i + 5]);
578  q[8 * i + 4] = _mm512_unpacklo_epi64(p[8 * i + 2], p[8 * i + 6]);
579  q[8 * i + 5] = _mm512_unpackhi_epi64(p[8 * i + 2], p[8 * i + 6]);
580  q[8 * i + 6] = _mm512_unpacklo_epi64(p[8 * i + 3], p[8 * i + 7]);
581  q[8 * i + 7] = _mm512_unpackhi_epi64(p[8 * i + 3], p[8 * i + 7]);
582  }
583 
584  __m512i f[32];
585 
586 #define PACKET32H_TRANSPOSE_HELPER(X, Y) \
587  do { \
588  f[Y * 8] = _mm512_inserti32x4(f[Y * 8], _mm512_extracti32x4_epi32(q[X * 8], Y), X); \
589  f[Y * 8 + 1] = _mm512_inserti32x4(f[Y * 8 + 1], _mm512_extracti32x4_epi32(q[X * 8 + 1], Y), X); \
590  f[Y * 8 + 2] = _mm512_inserti32x4(f[Y * 8 + 2], _mm512_extracti32x4_epi32(q[X * 8 + 2], Y), X); \
591  f[Y * 8 + 3] = _mm512_inserti32x4(f[Y * 8 + 3], _mm512_extracti32x4_epi32(q[X * 8 + 3], Y), X); \
592  f[Y * 8 + 4] = _mm512_inserti32x4(f[Y * 8 + 4], _mm512_extracti32x4_epi32(q[X * 8 + 4], Y), X); \
593  f[Y * 8 + 5] = _mm512_inserti32x4(f[Y * 8 + 5], _mm512_extracti32x4_epi32(q[X * 8 + 5], Y), X); \
594  f[Y * 8 + 6] = _mm512_inserti32x4(f[Y * 8 + 6], _mm512_extracti32x4_epi32(q[X * 8 + 6], Y), X); \
595  f[Y * 8 + 7] = _mm512_inserti32x4(f[Y * 8 + 7], _mm512_extracti32x4_epi32(q[X * 8 + 7], Y), X); \
596  } while (false);
597 
602 
609 
616 
617 #undef PACKET32H_TRANSPOSE_HELPER
618 
620  for (int i = 0; i < 32; i++) {
621  a.packet[i] = _mm512_castsi512_ph(f[i]);
622  }
623 }
624 
626  __m512i p0, p1, p2, p3, t0, t1, t2, t3, a0, a1, a2, a3;
627  t0 = _mm512_unpacklo_epi16(_mm512_castph_si512(a.packet[0]), _mm512_castph_si512(a.packet[1]));
628  t1 = _mm512_unpackhi_epi16(_mm512_castph_si512(a.packet[0]), _mm512_castph_si512(a.packet[1]));
629  t2 = _mm512_unpacklo_epi16(_mm512_castph_si512(a.packet[2]), _mm512_castph_si512(a.packet[3]));
630  t3 = _mm512_unpackhi_epi16(_mm512_castph_si512(a.packet[2]), _mm512_castph_si512(a.packet[3]));
631 
632  p0 = _mm512_unpacklo_epi32(t0, t2);
633  p1 = _mm512_unpackhi_epi32(t0, t2);
634  p2 = _mm512_unpacklo_epi32(t1, t3);
635  p3 = _mm512_unpackhi_epi32(t1, t3);
636 
637  a0 = p0;
638  a1 = p1;
639  a2 = p2;
640  a3 = p3;
641 
642  a0 = _mm512_inserti32x4(a0, _mm512_extracti32x4_epi32(p1, 0), 1);
643  a1 = _mm512_inserti32x4(a1, _mm512_extracti32x4_epi32(p0, 1), 0);
644 
645  a0 = _mm512_inserti32x4(a0, _mm512_extracti32x4_epi32(p2, 0), 2);
646  a2 = _mm512_inserti32x4(a2, _mm512_extracti32x4_epi32(p0, 2), 0);
647 
648  a0 = _mm512_inserti32x4(a0, _mm512_extracti32x4_epi32(p3, 0), 3);
649  a3 = _mm512_inserti32x4(a3, _mm512_extracti32x4_epi32(p0, 3), 0);
650 
651  a1 = _mm512_inserti32x4(a1, _mm512_extracti32x4_epi32(p2, 1), 2);
652  a2 = _mm512_inserti32x4(a2, _mm512_extracti32x4_epi32(p1, 2), 1);
653 
654  a2 = _mm512_inserti32x4(a2, _mm512_extracti32x4_epi32(p3, 2), 3);
655  a3 = _mm512_inserti32x4(a3, _mm512_extracti32x4_epi32(p2, 3), 2);
656 
657  a1 = _mm512_inserti32x4(a1, _mm512_extracti32x4_epi32(p3, 1), 3);
658  a3 = _mm512_inserti32x4(a3, _mm512_extracti32x4_epi32(p1, 3), 1);
659 
660  a.packet[0] = _mm512_castsi512_ph(a0);
661  a.packet[1] = _mm512_castsi512_ph(a1);
662  a.packet[2] = _mm512_castsi512_ph(a2);
663  a.packet[3] = _mm512_castsi512_ph(a3);
664 }
665 
666 // preverse
667 
668 template <>
670  return _mm512_permutexvar_ph(_mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
671  20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31),
672  a);
673 }
674 
675 // pscatter
676 
677 template <>
679  EIGEN_ALIGN64 half aux[32];
680  pstore(aux, from);
681 
683  for (int i = 0; i < 32; i++) {
684  to[stride * i] = aux[i];
685  }
686 }
687 
688 // pgather
689 
// Strided gather of 32 half values into a Packet32h.
// Reads the raw 16-bit member `.x` of from[0 * stride] ... from[31 * stride]
// and assembles them with _mm512_set_epi16, then reinterprets the integer
// vector as FP16. _mm512_set_epi16 takes its arguments from the highest lane
// down to the lowest, which is why the elements are listed in descending order:
// from[0] ends up in lane 0 and from[31 * stride] in lane 31.
// `from` must be readable at every index i * stride for i in [0, 31].
690 template <>
691 EIGEN_STRONG_INLINE Packet32h pgather<Eigen::half, Packet32h>(const Eigen::half* from, Index stride) {
692  return _mm512_castsi512_ph(_mm512_set_epi16(
693  from[31 * stride].x, from[30 * stride].x, from[29 * stride].x, from[28 * stride].x, from[27 * stride].x,
694  from[26 * stride].x, from[25 * stride].x, from[24 * stride].x, from[23 * stride].x, from[22 * stride].x,
695  from[21 * stride].x, from[20 * stride].x, from[19 * stride].x, from[18 * stride].x, from[17 * stride].x,
696  from[16 * stride].x, from[15 * stride].x, from[14 * stride].x, from[13 * stride].x, from[12 * stride].x,
697  from[11 * stride].x, from[10 * stride].x, from[9 * stride].x, from[8 * stride].x, from[7 * stride].x,
698  from[6 * stride].x, from[5 * stride].x, from[4 * stride].x, from[3 * stride].x, from[2 * stride].x,
699  from[1 * stride].x, from[0 * stride].x));
700 }
701 
702 template <>
704 template <>
706 template <>
708 template <>
710 template <>
712 template <>
714 template <>
716 template <>
718 template <>
720 template <>
722 
724  __m512d result = _mm512_undefined_pd();
725  result = _mm512_insertf64x4(result, _mm256_castsi256_pd(a), 0);
726  result = _mm512_insertf64x4(result, _mm256_castsi256_pd(b), 1);
727  return _mm512_castpd_ph(result);
728 }
729 
731  a = _mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castph_pd(x), 0));
732  b = _mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castph_pd(x), 1));
733 }
734 
735 // psin
736 template <>
738  Packet16h low;
739  Packet16h high;
740  extract2Packet16h(a, low, high);
741 
742  Packet16h lowOut = psin(low);
743  Packet16h highOut = psin(high);
744 
745  return combine2Packet16h(lowOut, highOut);
746 }
747 
748 // pcos
749 template <>
751  Packet16h low;
752  Packet16h high;
753  extract2Packet16h(a, low, high);
754 
755  Packet16h lowOut = pcos(low);
756  Packet16h highOut = pcos(high);
757 
758  return combine2Packet16h(lowOut, highOut);
759 }
760 
761 // plog
762 template <>
764  Packet16h low;
765  Packet16h high;
766  extract2Packet16h(a, low, high);
767 
768  Packet16h lowOut = plog(low);
769  Packet16h highOut = plog(high);
770 
771  return combine2Packet16h(lowOut, highOut);
772 }
773 
774 // plog2
775 template <>
777  Packet16h low;
778  Packet16h high;
779  extract2Packet16h(a, low, high);
780 
781  Packet16h lowOut = plog2(low);
782  Packet16h highOut = plog2(high);
783 
784  return combine2Packet16h(lowOut, highOut);
785 }
786 
787 // plog1p
788 template <>
790  Packet16h low;
791  Packet16h high;
792  extract2Packet16h(a, low, high);
793 
794  Packet16h lowOut = plog1p(low);
795  Packet16h highOut = plog1p(high);
796 
797  return combine2Packet16h(lowOut, highOut);
798 }
799 
800 // pexp
801 template <>
803  Packet16h low;
804  Packet16h high;
805  extract2Packet16h(a, low, high);
806 
807  Packet16h lowOut = pexp(low);
808  Packet16h highOut = pexp(high);
809 
810  return combine2Packet16h(lowOut, highOut);
811 }
812 
813 // pexpm1
814 template <>
816  Packet16h low;
817  Packet16h high;
818  extract2Packet16h(a, low, high);
819 
820  Packet16h lowOut = pexpm1(low);
821  Packet16h highOut = pexpm1(high);
822 
823  return combine2Packet16h(lowOut, highOut);
824 }
825 
826 // ptanh
827 template <>
829  Packet16h low;
830  Packet16h high;
831  extract2Packet16h(a, low, high);
832 
833  Packet16h lowOut = ptanh(low);
834  Packet16h highOut = ptanh(high);
835 
836  return combine2Packet16h(lowOut, highOut);
837 }
838 
839 // pfrexp
840 template <>
842  Packet16h low;
843  Packet16h high;
844  extract2Packet16h(a, low, high);
845 
846  Packet16h exp1 = _mm256_undefined_si256();
847  Packet16h exp2 = _mm256_undefined_si256();
848 
849  Packet16h lowOut = pfrexp(low, exp1);
850  Packet16h highOut = pfrexp(high, exp2);
851 
852  exponent = combine2Packet16h(exp1, exp2);
853 
854  return combine2Packet16h(lowOut, highOut);
855 }
856 
857 // pldexp
858 template <>
860  Packet16h low;
861  Packet16h high;
862  extract2Packet16h(a, low, high);
863 
864  Packet16h exp1;
865  Packet16h exp2;
866  extract2Packet16h(exponent, exp1, exp2);
867 
868  Packet16h lowOut = pldexp(low, exp1);
869  Packet16h highOut = pldexp(high, exp2);
870 
871  return combine2Packet16h(lowOut, highOut);
872 }
873 
874 } // end namespace internal
875 } // end namespace Eigen
876 
877 #endif // EIGEN_PACKET_MATH_FP16_AVX512_H
int i
Definition: BiCGSTAB_step_by_step.cpp:9
#define EIGEN_ALIGN64
Definition: ConfigureVectorization.h:144
#define EIGEN_DEBUG_ALIGNED_STORE
Definition: GenericPacketMath.h:38
#define EIGEN_DEBUG_ALIGNED_LOAD
Definition: GenericPacketMath.h:30
#define EIGEN_DEBUG_UNALIGNED_STORE
Definition: GenericPacketMath.h:42
#define EIGEN_DEBUG_UNALIGNED_LOAD
Definition: GenericPacketMath.h:34
#define EIGEN_UNROLL_LOOP
Definition: Macros.h:1298
#define EIGEN_DEVICE_FUNC
Definition: Macros.h:892
#define EIGEN_FAST_MATH
Definition: Macros.h:51
#define EIGEN_STRONG_INLINE
Definition: Macros.h:834
int data[]
Definition: Map_placement_new.cpp:1
Vector3f p0
Definition: MatrixBase_all.cpp:2
Vector3f p1
Definition: MatrixBase_all.cpp:2
#define PACKET32H_TRANSPOSE_HELPER(X, Y)
float * p
Definition: Tutorial_Map_using.cpp:9
Scalar * b
Definition: benchVecAdd.cpp:17
static int f(const TensorMap< Tensor< int, 3 > > &tensor)
Definition: cxx11_tensor_map.cpp:237
@ Aligned64
Definition: Constants.h:239
@ Aligned32
Definition: Constants.h:238
@ Aligned16
Definition: Constants.h:237
const Scalar * a
Definition: level2_cplx_impl.h:32
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 exp2(const bfloat16 &a)
Definition: BFloat16.h:616
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw raw_uint16_to_half(numext::uint16_t x)
Definition: Half.h:496
EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexpm1(const Packet &a)
Definition: GenericPacketMath.h:1097
EIGEN_STRONG_INLINE Packet32h psqrt< Packet32h >(const Packet32h &a)
Definition: PacketMathFP16.h:530
EIGEN_STRONG_INLINE Eigen::half pfirst< Packet32h >(const Packet32h &from)
Definition: PacketMathFP16.h:130
EIGEN_STRONG_INLINE Packet32h plog2< Packet32h >(const Packet32h &a)
Definition: PacketMathFP16.h:776
EIGEN_STRONG_INLINE Packet32h print< Packet32h >(const Packet32h &a)
Definition: PacketMathFP16.h:375
EIGEN_DEVICE_FUNC Packet padd(const Packet &a, const Packet &b)
Definition: GenericPacketMath.h:318
EIGEN_STRONG_INLINE Packet32h pdiv< Packet32h >(const Packet32h &a, const Packet32h &b)
Definition: PacketMathFP16.h:344
EIGEN_STRONG_INLINE Packet8h pmul< Packet8h >(const Packet8h &a, const Packet8h &b)
Definition: AVX/PacketMath.h:2406
EIGEN_STRONG_INLINE Packet8f pzero(const Packet8f &)
Definition: AVX/PacketMath.h:774
EIGEN_STRONG_INLINE void pscatter< half, Packet32h >(half *to, const Packet32h &from, Index stride)
Definition: PacketMathFP16.h:678
EIGEN_STRONG_INLINE Packet32h psignbit< Packet32h >(const Packet32h &a)
Definition: PacketMathFP16.h:197
EIGEN_STRONG_INLINE Packet32h padd< Packet32h >(const Packet32h &a, const Packet32h &b)
Definition: PacketMathFP16.h:293
EIGEN_STRONG_INLINE Packet16h pfrexp< Packet16h >(const Packet16h &, Packet16h &)
EIGEN_STRONG_INLINE Packet32h psin< Packet32h >(const Packet32h &a)
Definition: PacketMathFP16.h:737
EIGEN_STRONG_INLINE half predux< Packet32h >(const Packet32h &a)
Definition: PacketMathFP16.h:402
EIGEN_STRONG_INLINE Packet16h pexp< Packet16h >(const Packet16h &)
EIGEN_STRONG_INLINE Packet32h ploadquad< Packet32h >(const Eigen::half *from)
Definition: PacketMathFP16.h:180
EIGEN_STRONG_INLINE Packet32h pnegate< Packet32h >(const Packet32h &a)
Definition: PacketMathFP16.h:516
EIGEN_STRONG_INLINE Packet8h psub< Packet8h >(const Packet8h &a, const Packet8h &b)
Definition: AVX/PacketMath.h:2398
EIGEN_STRONG_INLINE Packet32h ploaddup< Packet32h >(const Eigen::half *from)
Definition: PacketMathFP16.h:171
EIGEN_STRONG_INLINE Packet16h plog< Packet16h >(const Packet16h &)
EIGEN_STRONG_INLINE void ptranspose(PacketBlock< Packet2cf, 2 > &kernel)
Definition: AltiVec/Complex.h:339
EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog2(const Packet &a)
Definition: GenericPacketMath.h:1123
EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog(const Packet &a)
Definition: GenericPacketMath.h:1103
EIGEN_STRONG_INLINE Packet16h psin< Packet16h >(const Packet16h &)
EIGEN_STRONG_INLINE Packet32h pabs< Packet32h >(const Packet32h &a)
Definition: PacketMathFP16.h:190
EIGEN_STRONG_INLINE Packet16h ptanh< Packet16h >(const Packet16h &)
EIGEN_STRONG_INLINE Packet32h plog< Packet32h >(const Packet32h &a)
Definition: PacketMathFP16.h:763
EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcos(const Packet &a)
Definition: GenericPacketMath.h:1022
EIGEN_STRONG_INLINE Packet32h combine2Packet16h(const Packet16h &a, const Packet16h &b)
Definition: PacketMathFP16.h:723
EIGEN_STRONG_INLINE Packet32h pset1frombits< Packet32h >(unsigned short from)
Definition: PacketMathFP16.h:123
EIGEN_STRONG_INLINE Eigen::half predux< Packet8h >(const Packet8h &a)
Definition: AVX/PacketMath.h:2451
EIGEN_STRONG_INLINE Packet32h psub< Packet32h >(const Packet32h &a, const Packet32h &b)
Definition: PacketMathFP16.h:310
EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f &a, const Packet4f &b)
Definition: AltiVec/PacketMath.h:1314
EIGEN_STRONG_INLINE Packet32h pcos< Packet32h >(const Packet32h &a)
Definition: PacketMathFP16.h:750
EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psin(const Packet &a)
Definition: GenericPacketMath.h:1015
EIGEN_STRONG_INLINE Packet32h pload< Packet32h >(const Eigen::half *from)
Definition: PacketMathFP16.h:144
EIGEN_STRONG_INLINE Packet8h pdiv< Packet8h >(const Packet8h &a, const Packet8h &b)
Definition: AVX/PacketMath.h:2414
EIGEN_STRONG_INLINE Packet16h pexpm1< Packet16h >(const Packet16h &)
EIGEN_STRONG_INLINE Packet8h por(const Packet8h &a, const Packet8h &b)
Definition: AVX/PacketMath.h:2309
EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i &a, const Packet4i &b)
Definition: AltiVec/PacketMath.h:1341
EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf &a)
Definition: AltiVec/Complex.h:303
EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f &a, const Packet4f &b, const Packet4f &c)
Definition: AltiVec/PacketMath.h:1218
EIGEN_STRONG_INLINE Packet8h pandnot(const Packet8h &a, const Packet8h &b)
Definition: AVX/PacketMath.h:2323
EIGEN_STRONG_INLINE Packet32h ptanh< Packet32h >(const Packet32h &a)
Definition: PacketMathFP16.h:828
EIGEN_STRONG_INLINE Packet32h plset< Packet32h >(const half &a)
Definition: PacketMathFP16.h:217
EIGEN_STRONG_INLINE Packet32h pset1< Packet32h >(const Eigen::half &from)
Definition: PacketMathFP16.h:111
EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet ptanh(const Packet &a)
Definition: GenericPacketMath.h:1071
EIGEN_STRONG_INLINE Packet32h pfrexp< Packet32h >(const Packet32h &a, Packet32h &exponent)
Definition: PacketMathFP16.h:841
EIGEN_STRONG_INLINE void pstoreu< half >(Eigen::half *to, const Packet16h &from)
Definition: AVX512/PacketMath.h:2230
EIGEN_STRONG_INLINE Packet32h prsqrt< Packet32h >(const Packet32h &a)
Definition: PacketMathFP16.h:537
EIGEN_STRONG_INLINE Packet16h plog2< Packet16h >(const Packet16h &)
EIGEN_STRONG_INLINE Packet32h pldexp< Packet32h >(const Packet32h &a, const Packet32h &exponent)
Definition: PacketMathFP16.h:859
EIGEN_STRONG_INLINE Packet32h pexpm1< Packet32h >(const Packet32h &a)
Definition: PacketMathFP16.h:815
EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog1p(const Packet &a)
Definition: GenericPacketMath.h:1110
EIGEN_STRONG_INLINE Packet32h pfloor< Packet32h >(const Packet32h &a)
Definition: PacketMathFP16.h:389
EIGEN_STRONG_INLINE Packet16h padd< Packet16h >(const Packet16h &a, const Packet16h &b)
Definition: AVX512/PacketMath.h:2374
EIGEN_STRONG_INLINE void extract2Packet16h(const Packet32h &x, Packet16h &a, Packet16h &b)
Definition: PacketMathFP16.h:730
EIGEN_DEVICE_FUNC void pstore(Scalar *to, const Packet &from)
Definition: GenericPacketMath.h:891
EIGEN_STRONG_INLINE Packet4f pnmsub(const Packet4f &a, const Packet4f &b, const Packet4f &c)
Definition: LSX/PacketMath.h:835
EIGEN_STRONG_INLINE Packet32h preciprocal< Packet32h >(const Packet32h &a)
Definition: PacketMathFP16.h:544
EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf &a, const Packet2cf &b)
Definition: AltiVec/Complex.h:353
EIGEN_STRONG_INLINE Packet8h pldexp(const Packet8h &a, const Packet8h &exponent)
Definition: arch/AVX/MathFunctions.h:80
EIGEN_STRONG_INLINE Packet32h pround< Packet32h >(const Packet32h &a)
Definition: PacketMathFP16.h:361
EIGEN_STRONG_INLINE Packet4f pmsub(const Packet4f &a, const Packet4f &b, const Packet4f &c)
Definition: LSX/PacketMath.h:819
EIGEN_STRONG_INLINE Packet8h pand(const Packet8h &a, const Packet8h &b)
Definition: AVX/PacketMath.h:2319
EIGEN_STRONG_INLINE Packet16h pcos< Packet16h >(const Packet16h &)
EIGEN_STRONG_INLINE Packet8h pxor(const Packet8h &a, const Packet8h &b)
Definition: AVX/PacketMath.h:2315
EIGEN_STRONG_INLINE Packet4f pnmadd(const Packet4f &a, const Packet4f &b, const Packet4f &c)
Definition: LSX/PacketMath.h:827
EIGEN_STRONG_INLINE half predux< Packet16h >(const Packet16h &from)
Definition: AVX512/PacketMath.h:2406
EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f &mask, const Packet4f &a, const Packet4f &b)
Definition: AltiVec/PacketMath.h:1474
EIGEN_STRONG_INLINE Packet32h ploadu< Packet32h >(const Eigen::half *from)
Definition: PacketMathFP16.h:151
EIGEN_STRONG_INLINE Packet32h pceil< Packet32h >(const Packet32h &a)
Definition: PacketMathFP16.h:382
EIGEN_DEVICE_FUNC Packet psub(const Packet &a, const Packet &b)
Definition: GenericPacketMath.h:337
EIGEN_STRONG_INLINE void pstore< half >(Eigen::half *to, const Packet16h &from)
Definition: AVX512/PacketMath.h:2223
EIGEN_STRONG_INLINE Packet16h plog1p< Packet16h >(const Packet16h &)
EIGEN_STRONG_INLINE Packet32h ptrunc< Packet32h >(const Packet32h &a)
Definition: PacketMathFP16.h:396
EIGEN_STRONG_INLINE Packet16h predux_half_dowto4< Packet32h >(const Packet32h &a)
Definition: PacketMathFP16.h:418
EIGEN_STRONG_INLINE Packet32h pmul< Packet32h >(const Packet32h &a, const Packet32h &b)
Definition: PacketMathFP16.h:327
EIGEN_STRONG_INLINE Packet32h pmin< Packet32h >(const Packet32h &a, const Packet32h &b)
Definition: PacketMathFP16.h:204
EIGEN_STRONG_INLINE Packet8h pfrexp(const Packet8h &a, Packet8h &exponent)
Definition: arch/AVX/MathFunctions.h:72
EIGEN_STRONG_INLINE Packet16h pmul< Packet16h >(const Packet16h &a, const Packet16h &b)
Definition: AVX512/PacketMath.h:2390
EIGEN_STRONG_INLINE Packet8h padd< Packet8h >(const Packet8h &a, const Packet8h &b)
Definition: AVX/PacketMath.h:2390
EIGEN_STRONG_INLINE Packet16h pdiv< Packet16h >(const Packet16h &a, const Packet16h &b)
Definition: AVX512/PacketMath.h:2398
EIGEN_STRONG_INLINE Packet32h plog1p< Packet32h >(const Packet32h &a)
Definition: PacketMathFP16.h:789
eigen_packet_wrapper< __m256i, 1 > Packet16h
Definition: AVX512/PacketMath.h:39
EIGEN_STRONG_INLINE Packet32h pmax< Packet32h >(const Packet32h &a, const Packet32h &b)
Definition: PacketMathFP16.h:211
EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f &a, const Packet4f &b)
Definition: AltiVec/PacketMath.h:1329
EIGEN_STRONG_INLINE Packet32h pexp< Packet32h >(const Packet32h &a)
Definition: PacketMathFP16.h:802
__m512h Packet32h
Definition: PacketMathFP16.h:20
eigen_packet_wrapper< __m128i, 2 > Packet8h
Definition: AVX/PacketMath.h:38
EIGEN_STRONG_INLINE Packet32h pconj< Packet32h >(const Packet32h &a)
Definition: PacketMathFP16.h:523
EIGEN_STRONG_INLINE Packet4f pexp(const Packet4f &_x)
Definition: LSX/PacketMath.h:2663
EIGEN_STRONG_INLINE Packet16h psub< Packet16h >(const Packet16h &a, const Packet16h &b)
Definition: AVX512/PacketMath.h:2382
EIGEN_STRONG_INLINE Packet16h pldexp< Packet16h >(const Packet16h &, const Packet16h &)
EIGEN_DEVICE_FUNC const Scalar & q
Definition: SpecialFunctionsImpl.h:2019
std::uint16_t uint16_t
Definition: Meta.h:38
Namespace containing all symbols from the Eigen library.
Definition: bench_norm.cpp:70
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition: Meta.h:83
int c
Definition: calibrate.py:100
Definition: Eigen_Colamd.h:49
list x
Definition: plotDoE.py:28
t
Definition: plotPSD.py:36
Definition: Half.h:139
Definition: GenericPacketMath.h:1407
@ HasRsqrt
Definition: GenericPacketMath.h:74
@ HasSin
Definition: GenericPacketMath.h:81
@ HasBlend
Definition: GenericPacketMath.h:66
@ HasNdtri
Definition: GenericPacketMath.h:97
@ HasCos
Definition: GenericPacketMath.h:82
@ HasCmp
Definition: GenericPacketMath.h:69
@ HasLog1p
Definition: GenericPacketMath.h:78
@ HasExp
Definition: GenericPacketMath.h:75
@ HasSqrt
Definition: GenericPacketMath.h:73
@ HasErf
Definition: GenericPacketMath.h:95
@ HasBessel
Definition: GenericPacketMath.h:98
@ HasExpm1
Definition: GenericPacketMath.h:76
@ HasLog
Definition: GenericPacketMath.h:77
@ HasTanh
Definition: GenericPacketMath.h:90
@ HasDiv
Definition: GenericPacketMath.h:71
Definition: GenericPacketMath.h:225
Definition: Meta.h:145
@ value
Definition: Meta.h:146
Packet32h type
Definition: PacketMathFP16.h:31
Packet16h half
Definition: PacketMathFP16.h:32
@ size
Definition: GenericPacketMath.h:113
@ AlignedOnScalar
Definition: GenericPacketMath.h:114
@ Vectorizable
Definition: GenericPacketMath.h:112
@ HasSub
Definition: GenericPacketMath.h:118
@ HasMax
Definition: GenericPacketMath.h:124
@ HasNegate
Definition: GenericPacketMath.h:120
@ HasMul
Definition: GenericPacketMath.h:119
@ HasAdd
Definition: GenericPacketMath.h:117
@ HasSetLinear
Definition: GenericPacketMath.h:126
@ HasMin
Definition: GenericPacketMath.h:123
@ HasConj
Definition: GenericPacketMath.h:125
@ HasAbs2
Definition: GenericPacketMath.h:122
@ HasAbs
Definition: GenericPacketMath.h:121
Packet8h half
Definition: PacketMathFP16.h:83
Eigen::half type
Definition: PacketMathFP16.h:82
Packet16h half
Definition: PacketMathFP16.h:70
Eigen::half type
Definition: PacketMathFP16.h:69
Packet8h half
Definition: PacketMathFP16.h:96
Eigen::half type
Definition: PacketMathFP16.h:95
Definition: GenericPacketMath.h:134
@ masked_load_available
Definition: GenericPacketMath.h:142
@ size
Definition: GenericPacketMath.h:139
@ masked_store_available
Definition: GenericPacketMath.h:143
@ vectorizable
Definition: GenericPacketMath.h:141
@ alignment
Definition: GenericPacketMath.h:140