HVX/PacketMath.h
#ifndef EIGEN_HVX_PACKET_MATH_H
#define EIGEN_HVX_PACKET_MATH_H

// Only 128B HVX is supported for now.
// Floating-point operations are supported only on V68 and later.
#if defined __HVX__ && (__HVX_LENGTH__ == 128) && __HVX_ARCH__ >= 68

// The floating-point operations do not follow the IEEE standard.
// From the HVX documentation:
// There is no concept of infinity or NaN. QFloat saturates to maximum
// exponent with maximum positive or minimum negative significand.
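// Consequently, results that would be +/-Inf or NaN under IEEE 754 instead
// saturate to the largest finite magnitude, and NaN-based tricks used by
// other backends cannot be relied on here.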

#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
#endif

namespace Eigen {
namespace internal {

// HVX utilities.

template <int D>
EIGEN_STRONG_INLINE HVX_Vector HVX_vmem(const void* m) {
  HVX_Vector v;
#if EIGEN_COMP_CLANG
  // Use inline assembly for an aligned vmem load of possibly unaligned memory.
  // Casting to HVX_Vector* could mislead the compiler about data alignment.
  __asm__("%0 = vmem(%1+#%2)" : "=v"(v) : "r"(m), "i"(D) : "memory");
#else
  void* aligned_mem =
      reinterpret_cast<void*>((reinterpret_cast<uintptr_t>(m) & ~(__HVX_LENGTH__ - 1)) + D * __HVX_LENGTH__);
  memcpy(&v, aligned_mem, __HVX_LENGTH__);
#endif
  return v;
}
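
// Illustrative example (a sketch, not part of the original source): with
// 128-byte vectors, HVX_vmem<0>(p) returns the aligned vector containing p
// and HVX_vmem<1>(p) the next aligned vector:
//   HVX_Vector lo = HVX_vmem<0>(p);  // bytes [align128(p), align128(p) + 128)
//   HVX_Vector hi = HVX_vmem<1>(p);  // the following 128 aligned bytes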

template <typename T>
EIGEN_STRONG_INLINE HVX_Vector HVX_load(const T* mem) {
  HVX_Vector v;
  memcpy(&v, reinterpret_cast<const HVX_Vector*>(mem), __HVX_LENGTH__);
  return v;
}

template <typename T>
EIGEN_STRONG_INLINE HVX_Vector HVX_loadu(const T* mem) {
  HVX_Vector v;
  memcpy(&v, mem, __HVX_LENGTH__);
  return v;
}
template <size_t Size, size_t Alignment, typename T>
EIGEN_STRONG_INLINE HVX_Vector HVX_load_partial(const T* mem) {
#if defined(EIGEN_HVX_FAST_PARTIAL_VECTOR_LOAD)
  // Fast partial vector load through aligned vmem loads.
  // The load may read past the end of the array, but it stays within aligned
  // vectors, which prevents a memory fault.
  HVX_Vector v0 = HVX_vmem<0>(mem);
  HVX_Vector v1 = v0;
  uintptr_t mem_addr = reinterpret_cast<uintptr_t>(mem);
  EIGEN_IF_CONSTEXPR(Size * sizeof(T) <= Alignment) {
    // Data no larger than the alignment never crosses an aligned vector
    // boundary.
    v1 = v0;
  }
  else {
    uintptr_t left_off = mem_addr & (__HVX_LENGTH__ - 1);
    if (left_off + Size * sizeof(T) > __HVX_LENGTH__) {
      v1 = HVX_vmem<1>(mem);
    } else {
      v1 = v0;
    }
  }
  return Q6_V_valign_VVR(v1, v0, mem_addr);
#else
  HVX_Vector v;
  memcpy(&v, mem, Size * sizeof(T));
  return v;
#endif
}
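
// Note on the fast path above: Q6_V_valign_VVR(v1, v0, mem_addr) extracts 128
// bytes starting at (mem_addr % 128) from the concatenation {v1:v0}, so the
// data beginning at `mem` lands in lane 0 even when the requested range
// straddles two aligned vectors.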

template <typename T>
EIGEN_STRONG_INLINE void HVX_store(T* mem, HVX_Vector v) {
  memcpy(reinterpret_cast<HVX_Vector*>(mem), &v, __HVX_LENGTH__);
}

template <typename T>
EIGEN_STRONG_INLINE void HVX_storeu(T* mem, HVX_Vector v) {
  memcpy(mem, &v, __HVX_LENGTH__);
}

template <size_t Size, size_t Alignment, typename T>
EIGEN_STRONG_INLINE void HVX_store_partial(T* mem, HVX_Vector v) {
  uintptr_t mem_addr = reinterpret_cast<uintptr_t>(mem);
  HVX_Vector value = Q6_V_vlalign_VVR(v, v, mem_addr);
  uintptr_t left_off = mem_addr & (__HVX_LENGTH__ - 1);
  uintptr_t right_off = left_off + Size * sizeof(T);

  HVX_VectorPred ql_not = Q6_Q_vsetq_R(mem_addr);
  HVX_VectorPred qr = Q6_Q_vsetq2_R(right_off);

  EIGEN_IF_CONSTEXPR(Size * sizeof(T) > Alignment) {
    if (right_off > __HVX_LENGTH__) {
      Q6_vmem_QRIV(qr, mem + __HVX_LENGTH__ / sizeof(T), value);
      qr = Q6_Q_vcmp_eq_VbVb(value, value);
    }
  }

  ql_not = Q6_Q_or_QQn(ql_not, qr);
  Q6_vmem_QnRIV(ql_not, mem, value);
}
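
// How the masked store above works: vlalign rotates `value` into the layout
// of the aligned vector containing `mem`; ql_not flags the bytes before `mem`
// and qr the bytes below right_off, so the final store, predicated on
// NOT(ql_not OR NOT qr), touches exactly Size * sizeof(T) bytes. A spill into
// the next aligned vector is written first with its own predicated store,
// after which qr is reset to all-true.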

// Packet definitions.
enum class HVXPacketSize {
  Full,
  Half,
  Quarter,
};

// The Hexagon compiler uses the same HVX_Vector type to represent all HVX
// vector types. Wrap the different element types (float32, int32, etc.) in
// distinct classes with an explicit constructor and casting back and forth
// to HVX_Vector.
template <HVXPacketSize T>
class HVXPacket {
 public:
  HVXPacket() = default;
  static HVXPacket Create(HVX_Vector v) { return HVXPacket(v); }
  HVX_Vector Get() const { return m_val; }

 private:
  explicit HVXPacket(HVX_Vector v) : m_val(v) {}
  HVX_Vector m_val = Q6_V_vzero();
};

typedef HVXPacket<HVXPacketSize::Full> Packet32f;
typedef HVXPacket<HVXPacketSize::Half> Packet16f;
typedef HVXPacket<HVXPacketSize::Quarter> Packet8f;
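
// Illustrative usage through Eigen's generic packet API (a sketch; `data` is
// an assumed float buffer, 128-byte aligned, with at least 32 elements):
//   Packet32f p = pload<Packet32f>(data);  // 32 floats per full vector
//   Packet32f s = pset1<Packet32f>(2.0f);  // broadcast scalar
//   pstore<float>(data, pmul(p, s));       // scale and store back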

// Packet traits.
template <>
struct packet_traits<float> : default_packet_traits {
  typedef Packet32f type;
  typedef Packet16f half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 32,

    HasCmp = 1,
    HasAdd = 1,
    HasSub = 1,
    HasShift = 0,
    HasMul = 1,
    HasNegate = 1,
    HasAbs = 1,
    HasArg = 0,
    HasAbs2 = 0,
    HasAbsDiff = 0,
    HasMin = 1,
    HasMax = 1,
    HasConj = 0,
    HasSetLinear = 0,
    HasBlend = 0,

    HasDiv = 0,

    HasSin = 0,
    HasCos = 0,
    HasACos = 0,
    HasASin = 0,
    HasATan = 0,
    HasATanh = 0,
    HasLog = 0,
    HasExp = 0,
    HasSqrt = 0,
    HasRsqrt = 0,
    HasTanh = 0,
    HasErf = 0,
    HasBessel = 0,
    HasNdtri = 0
  };
};

template <>
struct unpacket_traits<Packet32f> {
  typedef float type;
  typedef Packet16f half;
  enum {
    size = 32,
    alignment = Aligned128,
    vectorizable = true,
    masked_load_available = false,
    masked_store_available = false
  };
};

template <>
struct unpacket_traits<Packet16f> {
  typedef float type;
  typedef Packet8f half;
  enum {
    size = 16,
    // Much code assumes alignment equal to the packet size rather than
    // following this trait, so we do not use Aligned128 to optimize aligned
    // loads/stores.
    alignment = Aligned64,
    vectorizable = true,
    masked_load_available = false,
    masked_store_available = false
  };
};

template <>
struct unpacket_traits<Packet8f> {
  typedef float type;
  typedef Packet8f half;
  enum {
    size = 8,
    // Much code assumes alignment equal to the packet size rather than
    // following this trait, so we do not use Aligned128 to optimize aligned
    // loads/stores.
    alignment = Aligned32,
    vectorizable = true,
    masked_load_available = false,
    masked_store_available = false
  };
};

// float32 operations.
template <HVXPacketSize T>
EIGEN_STRONG_INLINE HVXPacket<T> pzero_hvx(const HVXPacket<T>&) {
  return HVXPacket<T>::Create(Q6_V_vzero());
}
template <>
EIGEN_STRONG_INLINE Packet32f pzero<Packet32f>(const Packet32f&) {
  return pzero_hvx(Packet32f());
}
template <>
EIGEN_STRONG_INLINE Packet16f pzero<Packet16f>(const Packet16f&) {
  return pzero_hvx(Packet16f());
}
template <>
EIGEN_STRONG_INLINE Packet8f pzero<Packet8f>(const Packet8f&) {
  return pzero_hvx(Packet8f());
}

template <HVXPacketSize T>
EIGEN_STRONG_INLINE typename unpacket_traits<HVXPacket<T>>::half predux_half_dowto4_hvx(const HVXPacket<T>& a) {
  const Index packet_size = unpacket_traits<HVXPacket<T>>::size;
  return unpacket_traits<HVXPacket<T>>::half::Create(
      Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(Q6_V_vror_VR(a.Get(), sizeof(float) * packet_size / 2), a.Get())));
}
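
// Rotating by half the packet's byte size adds element i to element
// i + packet_size/2; the low half of the sum is then returned as the
// half-sized packet.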
template <>
EIGEN_STRONG_INLINE Packet16f predux_half_dowto4(const Packet32f& a) {
  return predux_half_dowto4_hvx(a);
}
template <>
EIGEN_STRONG_INLINE Packet8f predux_half_dowto4(const Packet16f& a) {
  return predux_half_dowto4_hvx(a);
}

template <HVXPacketSize T>
EIGEN_STRONG_INLINE HVXPacket<T> pset1_hvx(const float& from) {
  union {
    float f;
    int32_t i;
  } u;
  u.f = from;
  return HVXPacket<T>::Create(Q6_V_vsplat_R(u.i));
}
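// The union above reinterprets the float's bits as an int32_t, which
// Q6_V_vsplat_R then broadcasts across all 32-bit lanes.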
template <>
EIGEN_STRONG_INLINE Packet32f pset1<Packet32f>(const float& from) {
  return pset1_hvx<HVXPacketSize::Full>(from);
}
template <>
EIGEN_STRONG_INLINE Packet16f pset1<Packet16f>(const float& from) {
  return pset1_hvx<HVXPacketSize::Half>(from);
}
template <>
EIGEN_STRONG_INLINE Packet8f pset1<Packet8f>(const float& from) {
  return pset1_hvx<HVXPacketSize::Quarter>(from);
}

template <>
EIGEN_STRONG_INLINE Packet32f pload<Packet32f>(const float* from) {
  return Packet32f::Create(HVX_load(from));
}
template <>
EIGEN_STRONG_INLINE Packet16f pload<Packet16f>(const float* from) {
  return Packet16f::Create(
      HVX_load_partial<unpacket_traits<Packet16f>::size, unpacket_traits<Packet16f>::alignment>(from));
}
template <>
EIGEN_STRONG_INLINE Packet8f pload<Packet8f>(const float* from) {
  return Packet8f::Create(
      HVX_load_partial<unpacket_traits<Packet8f>::size, unpacket_traits<Packet8f>::alignment>(from));
}

template <>
EIGEN_STRONG_INLINE Packet32f ploadu<Packet32f>(const float* from) {
  return Packet32f::Create(HVX_loadu(from));
}
template <>
EIGEN_STRONG_INLINE Packet16f ploadu<Packet16f>(const float* from) {
  return Packet16f::Create(HVX_load_partial<unpacket_traits<Packet16f>::size, 0>(from));
}
template <>
EIGEN_STRONG_INLINE Packet8f ploadu<Packet8f>(const float* from) {
  return Packet8f::Create(HVX_load_partial<unpacket_traits<Packet8f>::size, 0>(from));
}

template <>
EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet32f& from) {
  HVX_store(to, from.Get());
}
template <>
EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet16f& from) {
  HVX_store_partial<unpacket_traits<Packet16f>::size, unpacket_traits<Packet16f>::alignment>(to, from.Get());
}
template <>
EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet8f& from) {
  HVX_store_partial<unpacket_traits<Packet8f>::size, unpacket_traits<Packet8f>::alignment>(to, from.Get());
}

template <>
EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet32f& from) {
  HVX_storeu(to, from.Get());
}
template <>
EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet16f& from) {
  HVX_store_partial<unpacket_traits<Packet16f>::size, 0>(to, from.Get());
}
template <>
EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet8f& from) {
  HVX_store_partial<unpacket_traits<Packet8f>::size, 0>(to, from.Get());
}

template <HVXPacketSize T>
EIGEN_STRONG_INLINE HVXPacket<T> pmul_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
  return HVXPacket<T>::Create(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(a.Get(), b.Get())));
}
template <>
EIGEN_STRONG_INLINE Packet32f pmul<Packet32f>(const Packet32f& a, const Packet32f& b) {
  return pmul_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet16f pmul<Packet16f>(const Packet16f& a, const Packet16f& b) {
  return pmul_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8f pmul<Packet8f>(const Packet8f& a, const Packet8f& b) {
  return pmul_hvx(a, b);
}

template <HVXPacketSize T>
EIGEN_STRONG_INLINE HVXPacket<T> padd_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
  return HVXPacket<T>::Create(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(a.Get(), b.Get())));
}
template <>
EIGEN_STRONG_INLINE Packet32f padd<Packet32f>(const Packet32f& a, const Packet32f& b) {
  return padd_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet16f padd<Packet16f>(const Packet16f& a, const Packet16f& b) {
  return padd_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8f padd<Packet8f>(const Packet8f& a, const Packet8f& b) {
  return padd_hvx(a, b);
}

template <HVXPacketSize T>
EIGEN_STRONG_INLINE HVXPacket<T> psub_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
  return HVXPacket<T>::Create(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(a.Get(), b.Get())));
}
template <>
EIGEN_STRONG_INLINE Packet32f psub<Packet32f>(const Packet32f& a, const Packet32f& b) {
  return psub_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet16f psub<Packet16f>(const Packet16f& a, const Packet16f& b) {
  return psub_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8f psub<Packet8f>(const Packet8f& a, const Packet8f& b) {
  return psub_hvx(a, b);
}

// Negate by flipping the sign bit.
template <HVXPacketSize T>
EIGEN_STRONG_INLINE HVXPacket<T> pnegate_hvx(const HVXPacket<T>& a) {
  return HVXPacket<T>::Create(a.Get() ^ Q6_V_vsplat_R(0x80000000));
}
template <>
EIGEN_STRONG_INLINE Packet32f pnegate(const Packet32f& a) {
  return pnegate_hvx(a);
}
template <>
EIGEN_STRONG_INLINE Packet16f pnegate(const Packet16f& a) {
  return pnegate_hvx(a);
}
template <>
EIGEN_STRONG_INLINE Packet8f pnegate(const Packet8f& a) {
  return pnegate_hvx(a);
}

// a <= b is computed as NOT(a > b): lanes where the predicate holds are set
// to all-ones, the rest to zero.
template <HVXPacketSize T>
EIGEN_STRONG_INLINE HVXPacket<T> pcmp_le_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
  HVX_Vector v_true = Q6_Vb_vsplat_R(0xff);
  HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(a.Get(), b.Get());
  return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, Q6_V_vzero(), v_true));
}
template <>
EIGEN_STRONG_INLINE Packet32f pcmp_le(const Packet32f& a, const Packet32f& b) {
  return pcmp_le_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet16f pcmp_le(const Packet16f& a, const Packet16f& b) {
  return pcmp_le_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8f pcmp_le(const Packet8f& a, const Packet8f& b) {
  return pcmp_le_hvx(a, b);
}

template <HVXPacketSize T>
EIGEN_STRONG_INLINE HVXPacket<T> pcmp_eq_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
  HVX_Vector v_true = Q6_Vb_vsplat_R(0xff);
  HVX_VectorPred pred = Q6_Q_vcmp_eq_VwVw(a.Get(), b.Get());
  return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero()));
}
template <>
EIGEN_STRONG_INLINE Packet32f pcmp_eq(const Packet32f& a, const Packet32f& b) {
  return pcmp_eq_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet16f pcmp_eq(const Packet16f& a, const Packet16f& b) {
  return pcmp_eq_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8f pcmp_eq(const Packet8f& a, const Packet8f& b) {
  return pcmp_eq_hvx(a, b);
}

template <HVXPacketSize T>
EIGEN_STRONG_INLINE HVXPacket<T> pcmp_lt_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
  HVX_Vector v_true = Q6_Vb_vsplat_R(0xff);
  HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(b.Get(), a.Get());
  return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero()));
}
template <>
EIGEN_STRONG_INLINE Packet32f pcmp_lt(const Packet32f& a, const Packet32f& b) {
  return pcmp_lt_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet16f pcmp_lt(const Packet16f& a, const Packet16f& b) {
  return pcmp_lt_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8f pcmp_lt(const Packet8f& a, const Packet8f& b) {
  return pcmp_lt_hvx(a, b);
}

// HVX has no NaN representation, so this is the same comparison as pcmp_lt.
template <HVXPacketSize T>
EIGEN_STRONG_INLINE HVXPacket<T> pcmp_lt_or_nan_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
  HVX_Vector v_true = Q6_Vb_vsplat_R(0xff);
  HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(b.Get(), a.Get());
  return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero()));
}
template <>
EIGEN_STRONG_INLINE Packet32f pcmp_lt_or_nan(const Packet32f& a, const Packet32f& b) {
  return pcmp_lt_or_nan_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet16f pcmp_lt_or_nan(const Packet16f& a, const Packet16f& b) {
  return pcmp_lt_or_nan_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8f pcmp_lt_or_nan(const Packet8f& a, const Packet8f& b) {
  return pcmp_lt_or_nan_hvx(a, b);
}

// Absolute value by clearing the sign bit.
template <HVXPacketSize T>
EIGEN_STRONG_INLINE HVXPacket<T> pabs_hvx(const HVXPacket<T>& a) {
  return HVXPacket<T>::Create(a.Get() & Q6_V_vsplat_R(0x7FFFFFFF));
}
template <>
EIGEN_STRONG_INLINE Packet32f pabs(const Packet32f& a) {
  return pabs_hvx(a);
}
template <>
EIGEN_STRONG_INLINE Packet16f pabs(const Packet16f& a) {
  return pabs_hvx(a);
}
template <>
EIGEN_STRONG_INLINE Packet8f pabs(const Packet8f& a) {
  return pabs_hvx(a);
}

// Extract lane 0 by viewing the vector as a float array.
template <HVXPacketSize T>
EIGEN_STRONG_INLINE float pfirst_hvx(const HVXPacket<T>& a) {
  union {
    float array[1];
    HVX_Vector vector;
  } HVX_and_array;
  HVX_and_array.vector = a.Get();
  return HVX_and_array.array[0];
}
template <>
EIGEN_STRONG_INLINE float pfirst(const Packet32f& a) {
  return pfirst_hvx(a);
}
template <>
EIGEN_STRONG_INLINE float pfirst(const Packet16f& a) {
  return pfirst_hvx(a);
}
template <>
EIGEN_STRONG_INLINE float pfirst(const Packet8f& a) {
  return pfirst_hvx(a);
}
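
// The ptranspose kernels below implement the transpose as a butterfly network
// of Q6_W_vshuff_VVR stages: stride -4 interleaves 32-bit lanes, -8 64-bit
// lanes, then -16/-32/-64 the wider groups, doubling the shuffle width each
// stage until rows and columns are exchanged.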
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet32f, 4>& kernel) {
  // Shuffle the 32-bit lanes.
  HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
  HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);

  // Shuffle the 64-bit lanes.
  HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8);
  HVX_VectorPair v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_3_2), HEXAGON_HVX_GET_V1(v_0_1_0), -8);
  kernel.packet[0] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_1_1_0));
  kernel.packet[1] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_1_1_0));
  kernel.packet[2] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_1_3_2));
  kernel.packet[3] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_1_3_2));
}
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16f, 4>& kernel) {
  // Shuffle the 32-bit lanes.
  HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
  HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);

  // Shuffle the 64-bit lanes.
  HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8);

  kernel.packet[0] = Packet16f::Create(HEXAGON_HVX_GET_V0(v_1_1_0));
  kernel.packet[1] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_1_0), HEXAGON_HVX_GET_V0(v_1_1_0), 64));
  kernel.packet[2] = Packet16f::Create(HEXAGON_HVX_GET_V1(v_1_1_0));
  kernel.packet[3] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_1_1_0), HEXAGON_HVX_GET_V1(v_1_1_0), 64));
}
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8f, 4>& kernel) {
  // Shuffle the 32-bit lanes.
  HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
  HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);

  // Shuffle the 64-bit lanes.
  HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8);

  kernel.packet[0] = Packet8f::Create(HEXAGON_HVX_GET_V0(v_1_1_0));
  kernel.packet[1] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_1_0), HEXAGON_HVX_GET_V0(v_1_1_0), 32));
  kernel.packet[2] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_1_0), HEXAGON_HVX_GET_V0(v_1_1_0), 64));
  kernel.packet[3] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_1_0), HEXAGON_HVX_GET_V0(v_1_1_0), 96));
}

EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8f, 8>& kernel) {
  // Shuffle the 32-bit lanes.
  HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
  HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);
  HVX_VectorPair v_0_5_4 = Q6_W_vshuff_VVR(kernel.packet[5].Get(), kernel.packet[4].Get(), -4);
  HVX_VectorPair v_0_7_6 = Q6_W_vshuff_VVR(kernel.packet[7].Get(), kernel.packet[6].Get(), -4);

  // Shuffle the 64-bit lanes.
  HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8);
  HVX_VectorPair v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_7_6), HEXAGON_HVX_GET_V0(v_0_5_4), -8);

  // Shuffle the 128-bit lanes.
  v_0_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_3_2), HEXAGON_HVX_GET_V0(v_1_1_0), -16);

  kernel.packet[0] = Packet8f::Create(HEXAGON_HVX_GET_V0(v_0_1_0));
  kernel.packet[1] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_0_1_0), HEXAGON_HVX_GET_V0(v_0_1_0), 32));
  kernel.packet[2] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_0_1_0), HEXAGON_HVX_GET_V0(v_0_1_0), 64));
  kernel.packet[3] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_0_1_0), HEXAGON_HVX_GET_V0(v_0_1_0), 96));
  kernel.packet[4] = Packet8f::Create(HEXAGON_HVX_GET_V1(v_0_1_0));
  kernel.packet[5] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_0_1_0), HEXAGON_HVX_GET_V1(v_0_1_0), 32));
  kernel.packet[6] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_0_1_0), HEXAGON_HVX_GET_V1(v_0_1_0), 64));
  kernel.packet[7] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_0_1_0), HEXAGON_HVX_GET_V1(v_0_1_0), 96));
}
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16f, 16>& kernel) {
  // Shuffle the 32-bit lanes.
  HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
  HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);
  HVX_VectorPair v_0_5_4 = Q6_W_vshuff_VVR(kernel.packet[5].Get(), kernel.packet[4].Get(), -4);
  HVX_VectorPair v_0_7_6 = Q6_W_vshuff_VVR(kernel.packet[7].Get(), kernel.packet[6].Get(), -4);
  HVX_VectorPair v_0_9_8 = Q6_W_vshuff_VVR(kernel.packet[9].Get(), kernel.packet[8].Get(), -4);
  HVX_VectorPair v_0_11_10 = Q6_W_vshuff_VVR(kernel.packet[11].Get(), kernel.packet[10].Get(), -4);
  HVX_VectorPair v_0_13_12 = Q6_W_vshuff_VVR(kernel.packet[13].Get(), kernel.packet[12].Get(), -4);
  HVX_VectorPair v_0_15_14 = Q6_W_vshuff_VVR(kernel.packet[15].Get(), kernel.packet[14].Get(), -4);

  // Shuffle the 64-bit lanes.
  HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8);
  HVX_VectorPair v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_7_6), HEXAGON_HVX_GET_V0(v_0_5_4), -8);
  HVX_VectorPair v_1_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_11_10), HEXAGON_HVX_GET_V0(v_0_9_8), -8);
  HVX_VectorPair v_1_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_15_14), HEXAGON_HVX_GET_V0(v_0_13_12), -8);

  // Shuffle the 128-bit lanes.
  v_0_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_3_2), HEXAGON_HVX_GET_V0(v_1_1_0), -16);
  v_0_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_3_2), HEXAGON_HVX_GET_V1(v_1_1_0), -16);
  v_0_9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_7_6), HEXAGON_HVX_GET_V0(v_1_5_4), -16);
  v_0_11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_7_6), HEXAGON_HVX_GET_V1(v_1_5_4), -16);

  // Shuffle the 256-bit lanes.
  v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_9_8), HEXAGON_HVX_GET_V0(v_0_1_0), -32);
  v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_9_8), HEXAGON_HVX_GET_V1(v_0_1_0), -32);
  v_1_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_11_10), HEXAGON_HVX_GET_V0(v_0_3_2), -32);
  v_1_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_11_10), HEXAGON_HVX_GET_V1(v_0_3_2), -32);

  kernel.packet[0] = Packet16f::Create(HEXAGON_HVX_GET_V0(v_1_1_0));
  kernel.packet[1] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_1_0), HEXAGON_HVX_GET_V0(v_1_1_0), 64));
  kernel.packet[2] = Packet16f::Create(HEXAGON_HVX_GET_V1(v_1_1_0));
  kernel.packet[3] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_1_1_0), HEXAGON_HVX_GET_V1(v_1_1_0), 64));
  kernel.packet[4] = Packet16f::Create(HEXAGON_HVX_GET_V0(v_1_3_2));
  kernel.packet[5] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_3_2), HEXAGON_HVX_GET_V0(v_1_3_2), 64));
  kernel.packet[6] = Packet16f::Create(HEXAGON_HVX_GET_V1(v_1_3_2));
  kernel.packet[7] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_1_3_2), HEXAGON_HVX_GET_V1(v_1_3_2), 64));
  kernel.packet[8] = Packet16f::Create(HEXAGON_HVX_GET_V0(v_1_5_4));
  kernel.packet[9] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_5_4), HEXAGON_HVX_GET_V0(v_1_5_4), 64));
  kernel.packet[10] = Packet16f::Create(HEXAGON_HVX_GET_V1(v_1_5_4));
  kernel.packet[11] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_1_5_4), HEXAGON_HVX_GET_V1(v_1_5_4), 64));
  kernel.packet[12] = Packet16f::Create(HEXAGON_HVX_GET_V0(v_1_7_6));
  kernel.packet[13] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_7_6), HEXAGON_HVX_GET_V0(v_1_7_6), 64));
  kernel.packet[14] = Packet16f::Create(HEXAGON_HVX_GET_V1(v_1_7_6));
  kernel.packet[15] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_1_7_6), HEXAGON_HVX_GET_V1(v_1_7_6), 64));
}
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet32f, 32>& kernel) {
  // Shuffle the 32-bit lanes.
  HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
  HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);
  HVX_VectorPair v_0_5_4 = Q6_W_vshuff_VVR(kernel.packet[5].Get(), kernel.packet[4].Get(), -4);
  HVX_VectorPair v_0_7_6 = Q6_W_vshuff_VVR(kernel.packet[7].Get(), kernel.packet[6].Get(), -4);
  HVX_VectorPair v_0_9_8 = Q6_W_vshuff_VVR(kernel.packet[9].Get(), kernel.packet[8].Get(), -4);
  HVX_VectorPair v_0_11_10 = Q6_W_vshuff_VVR(kernel.packet[11].Get(), kernel.packet[10].Get(), -4);
  HVX_VectorPair v_0_13_12 = Q6_W_vshuff_VVR(kernel.packet[13].Get(), kernel.packet[12].Get(), -4);
  HVX_VectorPair v_0_15_14 = Q6_W_vshuff_VVR(kernel.packet[15].Get(), kernel.packet[14].Get(), -4);
  HVX_VectorPair v_0_17_16 = Q6_W_vshuff_VVR(kernel.packet[17].Get(), kernel.packet[16].Get(), -4);
  HVX_VectorPair v_0_19_18 = Q6_W_vshuff_VVR(kernel.packet[19].Get(), kernel.packet[18].Get(), -4);
  HVX_VectorPair v_0_21_20 = Q6_W_vshuff_VVR(kernel.packet[21].Get(), kernel.packet[20].Get(), -4);
  HVX_VectorPair v_0_23_22 = Q6_W_vshuff_VVR(kernel.packet[23].Get(), kernel.packet[22].Get(), -4);
  HVX_VectorPair v_0_25_24 = Q6_W_vshuff_VVR(kernel.packet[25].Get(), kernel.packet[24].Get(), -4);
  HVX_VectorPair v_0_27_26 = Q6_W_vshuff_VVR(kernel.packet[27].Get(), kernel.packet[26].Get(), -4);
  HVX_VectorPair v_0_29_28 = Q6_W_vshuff_VVR(kernel.packet[29].Get(), kernel.packet[28].Get(), -4);
  HVX_VectorPair v_0_31_30 = Q6_W_vshuff_VVR(kernel.packet[31].Get(), kernel.packet[30].Get(), -4);

  // Shuffle the 64-bit lanes.
  HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8);
  HVX_VectorPair v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_3_2), HEXAGON_HVX_GET_V1(v_0_1_0), -8);
  HVX_VectorPair v_1_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_7_6), HEXAGON_HVX_GET_V0(v_0_5_4), -8);
  HVX_VectorPair v_1_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_7_6), HEXAGON_HVX_GET_V1(v_0_5_4), -8);
  HVX_VectorPair v_1_9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_11_10), HEXAGON_HVX_GET_V0(v_0_9_8), -8);
  HVX_VectorPair v_1_11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_11_10), HEXAGON_HVX_GET_V1(v_0_9_8), -8);
  HVX_VectorPair v_1_13_12 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_15_14), HEXAGON_HVX_GET_V0(v_0_13_12), -8);
  HVX_VectorPair v_1_15_14 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_15_14), HEXAGON_HVX_GET_V1(v_0_13_12), -8);
  HVX_VectorPair v_1_17_16 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_19_18), HEXAGON_HVX_GET_V0(v_0_17_16), -8);
  HVX_VectorPair v_1_19_18 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_19_18), HEXAGON_HVX_GET_V1(v_0_17_16), -8);
  HVX_VectorPair v_1_21_20 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_23_22), HEXAGON_HVX_GET_V0(v_0_21_20), -8);
  HVX_VectorPair v_1_23_22 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_23_22), HEXAGON_HVX_GET_V1(v_0_21_20), -8);
  HVX_VectorPair v_1_25_24 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_27_26), HEXAGON_HVX_GET_V0(v_0_25_24), -8);
  HVX_VectorPair v_1_27_26 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_27_26), HEXAGON_HVX_GET_V1(v_0_25_24), -8);
  HVX_VectorPair v_1_29_28 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_31_30), HEXAGON_HVX_GET_V0(v_0_29_28), -8);
  HVX_VectorPair v_1_31_30 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_31_30), HEXAGON_HVX_GET_V1(v_0_29_28), -8);

  // Shuffle the 128-bit lanes.
  v_0_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_5_4), HEXAGON_HVX_GET_V0(v_1_1_0), -16);
  v_0_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_5_4), HEXAGON_HVX_GET_V1(v_1_1_0), -16);
  v_0_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_7_6), HEXAGON_HVX_GET_V0(v_1_3_2), -16);
  v_0_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_7_6), HEXAGON_HVX_GET_V1(v_1_3_2), -16);
  v_0_9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_13_12), HEXAGON_HVX_GET_V0(v_1_9_8), -16);
  v_0_11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_13_12), HEXAGON_HVX_GET_V1(v_1_9_8), -16);
  v_0_13_12 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_15_14), HEXAGON_HVX_GET_V0(v_1_11_10), -16);
  v_0_15_14 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_15_14), HEXAGON_HVX_GET_V1(v_1_11_10), -16);
  v_0_17_16 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_21_20), HEXAGON_HVX_GET_V0(v_1_17_16), -16);
  v_0_19_18 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_21_20), HEXAGON_HVX_GET_V1(v_1_17_16), -16);
  v_0_21_20 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_23_22), HEXAGON_HVX_GET_V0(v_1_19_18), -16);
  v_0_23_22 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_23_22), HEXAGON_HVX_GET_V1(v_1_19_18), -16);
  v_0_25_24 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_29_28), HEXAGON_HVX_GET_V0(v_1_25_24), -16);
  v_0_27_26 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_29_28), HEXAGON_HVX_GET_V1(v_1_25_24), -16);
  v_0_29_28 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_31_30), HEXAGON_HVX_GET_V0(v_1_27_26), -16);
  v_0_31_30 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_31_30), HEXAGON_HVX_GET_V1(v_1_27_26), -16);

  // Shuffle the 256-bit lanes.
  v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_9_8), HEXAGON_HVX_GET_V0(v_0_1_0), -32);
  v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_9_8), HEXAGON_HVX_GET_V1(v_0_1_0), -32);
  v_1_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_11_10), HEXAGON_HVX_GET_V0(v_0_3_2), -32);
  v_1_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_11_10), HEXAGON_HVX_GET_V1(v_0_3_2), -32);
  v_1_9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_13_12), HEXAGON_HVX_GET_V0(v_0_5_4), -32);
  v_1_11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_13_12), HEXAGON_HVX_GET_V1(v_0_5_4), -32);
  v_1_13_12 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_15_14), HEXAGON_HVX_GET_V0(v_0_7_6), -32);
  v_1_15_14 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_15_14), HEXAGON_HVX_GET_V1(v_0_7_6), -32);
  v_1_17_16 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_25_24), HEXAGON_HVX_GET_V0(v_0_17_16), -32);
  v_1_19_18 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_25_24), HEXAGON_HVX_GET_V1(v_0_17_16), -32);
  v_1_21_20 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_27_26), HEXAGON_HVX_GET_V0(v_0_19_18), -32);
  v_1_23_22 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_27_26), HEXAGON_HVX_GET_V1(v_0_19_18), -32);
  v_1_25_24 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_29_28), HEXAGON_HVX_GET_V0(v_0_21_20), -32);
  v_1_27_26 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_29_28), HEXAGON_HVX_GET_V1(v_0_21_20), -32);
  v_1_29_28 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_31_30), HEXAGON_HVX_GET_V0(v_0_23_22), -32);
  v_1_31_30 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_31_30), HEXAGON_HVX_GET_V1(v_0_23_22), -32);

  // Shuffle the 512-bit lanes.
  v_0_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_17_16), HEXAGON_HVX_GET_V0(v_1_1_0), -64);
  v_0_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_17_16), HEXAGON_HVX_GET_V1(v_1_1_0), -64);
  v_0_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_19_18), HEXAGON_HVX_GET_V0(v_1_3_2), -64);
  v_0_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_19_18), HEXAGON_HVX_GET_V1(v_1_3_2), -64);
  v_0_9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_21_20), HEXAGON_HVX_GET_V0(v_1_5_4), -64);
  v_0_11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_21_20), HEXAGON_HVX_GET_V1(v_1_5_4), -64);
  v_0_13_12 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_23_22), HEXAGON_HVX_GET_V0(v_1_7_6), -64);
  v_0_15_14 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_23_22), HEXAGON_HVX_GET_V1(v_1_7_6), -64);
  v_0_17_16 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_25_24), HEXAGON_HVX_GET_V0(v_1_9_8), -64);
  v_0_19_18 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_25_24), HEXAGON_HVX_GET_V1(v_1_9_8), -64);
  v_0_21_20 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_27_26), HEXAGON_HVX_GET_V0(v_1_11_10), -64);
  v_0_23_22 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_27_26), HEXAGON_HVX_GET_V1(v_1_11_10), -64);
  v_0_25_24 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_29_28), HEXAGON_HVX_GET_V0(v_1_13_12), -64);
  v_0_27_26 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_29_28), HEXAGON_HVX_GET_V1(v_1_13_12), -64);
  v_0_29_28 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_31_30), HEXAGON_HVX_GET_V0(v_1_15_14), -64);
  v_0_31_30 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_31_30), HEXAGON_HVX_GET_V1(v_1_15_14), -64);

  kernel.packet[0] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_1_0));
  kernel.packet[1] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_1_0));
  kernel.packet[2] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_3_2));
  kernel.packet[3] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_3_2));
  kernel.packet[4] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_5_4));
  kernel.packet[5] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_5_4));
  kernel.packet[6] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_7_6));
  kernel.packet[7] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_7_6));
  kernel.packet[8] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_9_8));
  kernel.packet[9] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_9_8));
  kernel.packet[10] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_11_10));
  kernel.packet[11] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_11_10));
  kernel.packet[12] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_13_12));
  kernel.packet[13] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_13_12));
  kernel.packet[14] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_15_14));
  kernel.packet[15] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_15_14));
  kernel.packet[16] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_17_16));
  kernel.packet[17] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_17_16));
  kernel.packet[18] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_19_18));
  kernel.packet[19] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_19_18));
  kernel.packet[20] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_21_20));
  kernel.packet[21] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_21_20));
  kernel.packet[22] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_23_22));
  kernel.packet[23] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_23_22));
  kernel.packet[24] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_25_24));
  kernel.packet[25] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_25_24));
  kernel.packet[26] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_27_26));
  kernel.packet[27] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_27_26));
  kernel.packet[28] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_29_28));
  kernel.packet[29] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_29_28));
  kernel.packet[30] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_31_30));
  kernel.packet[31] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_31_30));
}
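
// Horizontal sum in log2(packet_size) steps: each iteration rotates the
// partial sums by i floats and accumulates in the wider qf32 format; at the
// end every lane holds the total, and lane 0 is extracted with pfirst.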
template <HVXPacketSize T>
EIGEN_STRONG_INLINE float predux_hvx(const HVXPacket<T>& a) {
  const Index packet_size = unpacket_traits<HVXPacket<T>>::size;
  HVX_Vector vsum = Q6_Vqf32_vadd_VsfVsf(a.Get(), Q6_V_vror_VR(a.Get(), sizeof(float)));
  for (int i = 2; i < packet_size; i <<= 1) {
    vsum = Q6_Vqf32_vadd_Vqf32Vqf32(vsum, Q6_V_vror_VR(vsum, i * sizeof(float)));
  }
  return pfirst(HVXPacket<T>::Create(Q6_Vsf_equals_Vqf32(vsum)));
}
template <>
EIGEN_STRONG_INLINE float predux<Packet32f>(const Packet32f& a) {
  return predux_hvx(a);
}
template <>
EIGEN_STRONG_INLINE float predux<Packet16f>(const Packet16f& a) {
  return predux_hvx(a);
}
template <>
EIGEN_STRONG_INLINE float predux<Packet8f>(const Packet8f& a) {
  return predux_hvx(a);
}

template <HVXPacketSize T>
EIGEN_STRONG_INLINE HVXPacket<T> ploaddup_hvx(const float* from) {
  constexpr Index size = unpacket_traits<HVXPacket<T>>::size / 2;
  HVX_Vector load = HVX_load_partial<size, 0>(from);
  HVX_VectorPair dup = Q6_W_vshuff_VVR(load, load, -4);
  return HVXPacket<T>::Create(HEXAGON_HVX_GET_V0(dup));
}
template <>
EIGEN_STRONG_INLINE Packet32f ploaddup(const float* from) {
  return ploaddup_hvx<HVXPacketSize::Full>(from);
}
template <>
EIGEN_STRONG_INLINE Packet16f ploaddup(const float* from) {
  return ploaddup_hvx<HVXPacketSize::Half>(from);
}
template <>
EIGEN_STRONG_INLINE Packet8f ploaddup(const float* from) {
  return ploaddup_hvx<HVXPacketSize::Quarter>(from);
}

template <HVXPacketSize T>
EIGEN_STRONG_INLINE HVXPacket<T> ploadquad_hvx(const float* from) {
  constexpr Index size = unpacket_traits<HVXPacket<T>>::size / 4;
  HVX_Vector load = HVX_load_partial<size, 0>(from);
  HVX_VectorPair dup = Q6_W_vshuff_VVR(load, load, -4);
  HVX_VectorPair quad = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(dup), HEXAGON_HVX_GET_V0(dup), -8);
  return HVXPacket<T>::Create(HEXAGON_HVX_GET_V0(quad));
}
template <>
EIGEN_STRONG_INLINE Packet32f ploadquad(const float* from) {
  return ploadquad_hvx<HVXPacketSize::Full>(from);
}
template <>
EIGEN_STRONG_INLINE Packet16f ploadquad(const float* from) {
  return ploadquad_hvx<HVXPacketSize::Half>(from);
}
template <>
EIGEN_STRONG_INLINE Packet8f ploadquad(const float* from) {
  return ploadquad_hvx<HVXPacketSize::Quarter>(from);
}

// preverse uses a vdelta permute; the splatted control byte reverses the
// order of the packet's 4-byte elements (the smaller packets reverse within
// their 64- or 32-byte span).
template <>
EIGEN_STRONG_INLINE Packet32f preverse(const Packet32f& a) {
  HVX_Vector delta = Q6_Vb_vsplat_R(0x7c);
  return Packet32f::Create(Q6_V_vdelta_VV(a.Get(), delta));
}

template <>
EIGEN_STRONG_INLINE Packet16f preverse(const Packet16f& a) {
  HVX_Vector delta = Q6_Vb_vsplat_R(0x3c);
  return Packet16f::Create(Q6_V_vdelta_VV(a.Get(), delta));
}

template <>
EIGEN_STRONG_INLINE Packet8f preverse(const Packet8f& a) {
  HVX_Vector delta = Q6_Vb_vsplat_R(0x1c);
  return Packet8f::Create(Q6_V_vdelta_VV(a.Get(), delta));
}
template <HVXPacketSize T>
EIGEN_STRONG_INLINE HVXPacket<T> pmin_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
  return HVXPacket<T>::Create(Q6_Vsf_vmin_VsfVsf(a.Get(), b.Get()));
}
template <>
EIGEN_STRONG_INLINE Packet32f pmin(const Packet32f& a, const Packet32f& b) {
  return pmin_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet16f pmin(const Packet16f& a, const Packet16f& b) {
  return pmin_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8f pmin(const Packet8f& a, const Packet8f& b) {
  return pmin_hvx(a, b);
}

template <HVXPacketSize T>
EIGEN_STRONG_INLINE HVXPacket<T> pmax_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
  return HVXPacket<T>::Create(Q6_Vsf_vmax_VsfVsf(a.Get(), b.Get()));
}
template <>
EIGEN_STRONG_INLINE Packet32f pmax(const Packet32f& a, const Packet32f& b) {
  return pmax_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet16f pmax(const Packet16f& a, const Packet16f& b) {
  return pmax_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8f pmax(const Packet8f& a, const Packet8f& b) {
  return pmax_hvx(a, b);
}
template <HVXPacketSize T>
EIGEN_STRONG_INLINE HVXPacket<T> pand_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
  return HVXPacket<T>::Create(a.Get() & b.Get());
}
template <>
EIGEN_STRONG_INLINE Packet32f pand(const Packet32f& a, const Packet32f& b) {
  return pand_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet16f pand(const Packet16f& a, const Packet16f& b) {
  return pand_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8f pand(const Packet8f& a, const Packet8f& b) {
  return pand_hvx(a, b);
}

template <HVXPacketSize T>
EIGEN_STRONG_INLINE HVXPacket<T> por_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
  return HVXPacket<T>::Create(a.Get() | b.Get());
}
template <>
EIGEN_STRONG_INLINE Packet32f por(const Packet32f& a, const Packet32f& b) {
  return por_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet16f por(const Packet16f& a, const Packet16f& b) {
  return por_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8f por(const Packet8f& a, const Packet8f& b) {
  return por_hvx(a, b);
}

template <HVXPacketSize T>
EIGEN_STRONG_INLINE HVXPacket<T> pxor_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
  return HVXPacket<T>::Create(a.Get() ^ b.Get());
}
template <>
EIGEN_STRONG_INLINE Packet32f pxor(const Packet32f& a, const Packet32f& b) {
  return pxor_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet16f pxor(const Packet16f& a, const Packet16f& b) {
  return pxor_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8f pxor(const Packet8f& a, const Packet8f& b) {
  return pxor_hvx(a, b);
}

template <HVXPacketSize T>
EIGEN_STRONG_INLINE HVXPacket<T> pnot_hvx(const HVXPacket<T>& a) {
  return HVXPacket<T>::Create(~a.Get());
}
template <>
EIGEN_STRONG_INLINE Packet32f pnot(const Packet32f& a) {
  return pnot_hvx(a);
}
template <>
EIGEN_STRONG_INLINE Packet16f pnot(const Packet16f& a) {
  return pnot_hvx(a);
}
template <>
EIGEN_STRONG_INLINE Packet8f pnot(const Packet8f& a) {
  return pnot_hvx(a);
}
// Select a where the mask is non-zero, b where the mask is zero.
template <HVXPacketSize T>
EIGEN_STRONG_INLINE HVXPacket<T> pselect_hvx(const HVXPacket<T>& mask, const HVXPacket<T>& a, const HVXPacket<T>& b) {
  HVX_VectorPred pred = Q6_Q_vcmp_eq_VwVw(mask.Get(), Q6_V_vzero());
  return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, b.Get(), a.Get()));
}
template <>
EIGEN_STRONG_INLINE Packet32f pselect(const Packet32f& mask, const Packet32f& a, const Packet32f& b) {
  return pselect_hvx(mask, a, b);
}
template <>
EIGEN_STRONG_INLINE Packet16f pselect(const Packet16f& mask, const Packet16f& a, const Packet16f& b) {
  return pselect_hvx(mask, a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8f pselect(const Packet8f& mask, const Packet8f& a, const Packet8f& b) {
  return pselect_hvx(mask, a, b);
}
// Generic log-step reduction: rotate by 1, 2, 4, ... lanes and combine with
// `op`, then read the reduced value from lane 0.
template <HVXPacketSize T, typename Op>
EIGEN_STRONG_INLINE float predux_generic(const HVXPacket<T>& a, Op op) {
  const Index packet_size = unpacket_traits<HVXPacket<T>>::size;
  HVXPacket<T> vredux = a;
  for (int i = 1; i < packet_size; i <<= 1) {
    vredux = op(vredux, HVXPacket<T>::Create(Q6_V_vror_VR(vredux.Get(), i * sizeof(float))));
  }
  return pfirst(vredux);
}

template <>
EIGEN_STRONG_INLINE float predux_max(const Packet32f& a) {
  return predux_generic(a, pmax<Packet32f>);
}
template <>
EIGEN_STRONG_INLINE float predux_max(const Packet16f& a) {
  return predux_generic(a, pmax<Packet16f>);
}
template <>
EIGEN_STRONG_INLINE float predux_max(const Packet8f& a) {
  return predux_generic(a, pmax<Packet8f>);
}

template <>
EIGEN_STRONG_INLINE float predux_min(const Packet32f& a) {
  return predux_generic(a, pmin<Packet32f>);
}
template <>
EIGEN_STRONG_INLINE float predux_min(const Packet16f& a) {
  return predux_generic(a, pmin<Packet16f>);
}
template <>
EIGEN_STRONG_INLINE float predux_min(const Packet8f& a) {
  return predux_generic(a, pmin<Packet8f>);
}

template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet32f& a) {
  return predux_generic(a, por<Packet32f>) != 0.0f;
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet16f& a) {
  return predux_generic(a, por<Packet16f>) != 0.0f;
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet8f& a) {
  return predux_generic(a, por<Packet8f>) != 0.0f;
}
static const float index_vsf[32] __attribute__((aligned(__HVX_LENGTH__))) = {
    0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
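
// plset(a) returns {a, a+1, ..., a+packet_size-1}, built by adding a splat of
// `a` to the 0..31 index table above.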
template <HVXPacketSize T>
EIGEN_STRONG_INLINE HVXPacket<T> plset_hvx(const float& a) {
  return padd(pload<HVXPacket<T>>(index_vsf), pset1<HVXPacket<T>>(a));
}
template <>
EIGEN_STRONG_INLINE Packet32f plset(const float& a) {
  return plset_hvx<HVXPacketSize::Full>(a);
}
template <>
EIGEN_STRONG_INLINE Packet16f plset(const float& a) {
  return plset_hvx<HVXPacketSize::Half>(a);
}
template <>
EIGEN_STRONG_INLINE Packet8f plset(const float& a) {
  return plset_hvx<HVXPacketSize::Quarter>(a);
}
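
// Strided scatter/gather go through a vector-aligned stack buffer: a scalar
// loop performs the strided addressing, while the packet itself is moved with
// a single aligned vector store/load.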
template <HVXPacketSize T>
EIGEN_STRONG_INLINE void pscatter_hvx(float* to, const HVXPacket<T>& from, Index stride) {
  const Index packet_size = unpacket_traits<HVXPacket<T>>::size;
  float elements[packet_size] __attribute__((aligned(__HVX_LENGTH__)));
  pstore<float>(elements, from);
  for (Index i = 0; i < packet_size; ++i) {
    to[i * stride] = elements[i];
  }
}
template <>
EIGEN_STRONG_INLINE void pscatter<float, Packet32f>(float* to, const Packet32f& from, Index stride) {
  pscatter_hvx(to, from, stride);
}
template <>
EIGEN_STRONG_INLINE void pscatter<float, Packet16f>(float* to, const Packet16f& from, Index stride) {
  pscatter_hvx(to, from, stride);
}
template <>
EIGEN_STRONG_INLINE void pscatter<float, Packet8f>(float* to, const Packet8f& from, Index stride) {
  pscatter_hvx(to, from, stride);
}

template <HVXPacketSize T>
EIGEN_STRONG_INLINE HVXPacket<T> pgather_hvx(const float* from, Index stride) {
  const Index packet_size = unpacket_traits<HVXPacket<T>>::size;
  float elements[packet_size] __attribute__((aligned(__HVX_LENGTH__)));
  for (Index i = 0; i < packet_size; i++) {
    elements[i] = from[i * stride];
  }
  return pload<HVXPacket<T>>(elements);
}
template <>
EIGEN_STRONG_INLINE Packet32f pgather<float, Packet32f>(const float* from, Index stride) {
  return pgather_hvx<HVXPacketSize::Full>(from, stride);
}
template <>
EIGEN_STRONG_INLINE Packet16f pgather<float, Packet16f>(const float* from, Index stride) {
  return pgather_hvx<HVXPacketSize::Half>(from, stride);
}
template <>
EIGEN_STRONG_INLINE Packet8f pgather<float, Packet8f>(const float* from, Index stride) {
  return pgather_hvx<HVXPacketSize::Quarter>(from, stride);
}

}  // end namespace internal
}  // end namespace Eigen

#endif  // __HVX__ && (__HVX_LENGTH__ == 128) && __HVX_ARCH__ >= 68

#endif  // EIGEN_HVX_PACKET_MATH_H