SVE/PacketMath.h
Go to the documentation of this file.
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2020, Arm Limited and Contributors
5 //
6 // This Source Code Form is subject to the terms of the Mozilla
7 // Public License v. 2.0. If a copy of the MPL was not distributed
8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 
10 #ifndef EIGEN_PACKET_MATH_SVE_H
11 #define EIGEN_PACKET_MATH_SVE_H
12 
13 // IWYU pragma: private
14 #include "../../InternalHeaderCheck.h"
15 
16 namespace Eigen {
17 namespace internal {
// Backend configuration macros.
// The product threshold defaults to 8 here, but only when nothing defined it
// earlier, so an earlier definition takes precedence.
18 #ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
19 #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
20 #endif
21 
// Value-less feature flag, defined unless already present.
// NOTE(review): presumably consumed elsewhere to select fused multiply-add
// code paths - confirm at the use sites.
22 #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
23 #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
24 #endif
25 
// Defined without an #ifndef guard: the register count this backend reports.
26 #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
27 
28 template <typename Scalar, int SVEVectorLength>
30  enum { size = SVEVectorLength / (sizeof(Scalar) * CHAR_BIT) };
31 };
32 
33 /********************************* int32 **************************************/
// 32-bit signed integer packet: svint32_t given a fixed size of
// EIGEN_ARM64_SVE_VL bits through the arm_sve_vector_bits attribute.
34 typedef svint32_t PacketXi __attribute__((arm_sve_vector_bits(EIGEN_ARM64_SVE_VL)));
35 
36 template <>
38  typedef PacketXi type;
39  typedef PacketXi half; // Half not implemented yet
40  enum {
44 
45  HasAdd = 1,
46  HasSub = 1,
47  HasShift = 1,
48  HasMul = 1,
49  HasNegate = 1,
50  HasAbs = 1,
51  HasArg = 0,
52  HasAbs2 = 1,
53  HasMin = 1,
54  HasMax = 1,
55  HasConj = 1,
57  HasBlend = 0,
58  HasReduxp = 0 // Not implemented in SVE
59  };
60 };
61 
// Packet-to-scalar traits for PacketXi.
// `half` aliases the full packet type because no half-width SVE packet is
// implemented yet (see the inline remark below).
62 template <>
63 struct unpacket_traits<PacketXi> {
65  typedef PacketXi half; // Half not yet implemented
66  enum {
69  vectorizable = true,
72  };
73 };
74 
// Issues a prefetch hint for the int32 data at `addr`: svprfw is called with an
// all-true 32-bit predicate and the SV_PLDL1KEEP prefetch operation.
75 template <>
76 EIGEN_STRONG_INLINE void prefetch<numext::int32_t>(const numext::int32_t* addr) {
77  svprfw(svptrue_b32(), addr, SV_PLDL1KEEP);
78 }
79 
80 template <>
82  return svdup_n_s32(from);
83 }
84 
85 template <>
88  for (int i = 0; i < packet_traits<numext::int32_t>::size; i++) c[i] = i;
89  return svadd_s32_x(svptrue_b32(), pset1<PacketXi>(a), svld1_s32(svptrue_b32(), c));
90 }
91 
// Lane-wise int32 sum a + b (svadd_s32_x under an all-true predicate).
92 template <>
93 EIGEN_STRONG_INLINE PacketXi padd<PacketXi>(const PacketXi& a, const PacketXi& b) {
94  return svadd_s32_x(svptrue_b32(), a, b);
95 }
96 
// Lane-wise int32 difference a - b (svsub_s32_x under an all-true predicate).
97 template <>
98 EIGEN_STRONG_INLINE PacketXi psub<PacketXi>(const PacketXi& a, const PacketXi& b) {
99  return svsub_s32_x(svptrue_b32(), a, b);
100 }
101 
// Lane-wise int32 negation of a (svneg_s32_x under an all-true predicate).
102 template <>
103 EIGEN_STRONG_INLINE PacketXi pnegate(const PacketXi& a) {
104  return svneg_s32_x(svptrue_b32(), a);
105 }
106 
// Complex conjugate of an integer packet: the input is returned unchanged.
107 template <>
108 EIGEN_STRONG_INLINE PacketXi pconj(const PacketXi& a) {
109  return a;
110 }
111 
// Lane-wise int32 product a * b (svmul_s32_x under an all-true predicate).
112 template <>
113 EIGEN_STRONG_INLINE PacketXi pmul<PacketXi>(const PacketXi& a, const PacketXi& b) {
114  return svmul_s32_x(svptrue_b32(), a, b);
115 }
116 
// Lane-wise signed int32 quotient a / b (svdiv_s32_x under an all-true predicate).
// NOTE(review): the result for zero divisors is whatever the underlying SVE
// divide produces; no check is made here - confirm callers do not rely on a trap.
117 template <>
118 EIGEN_STRONG_INLINE PacketXi pdiv<PacketXi>(const PacketXi& a, const PacketXi& b) {
119  return svdiv_s32_x(svptrue_b32(), a, b);
120 }
121 
// Lane-wise multiply-add a * b + c.
// svmla takes the accumulator first, hence the (c, a, b) argument order.
122 template <>
123 EIGEN_STRONG_INLINE PacketXi pmadd(const PacketXi& a, const PacketXi& b, const PacketXi& c) {
124  return svmla_s32_x(svptrue_b32(), c, a, b);
125 }
126 
// Lane-wise signed int32 minimum of a and b (svmin_s32_x under an all-true predicate).
127 template <>
128 EIGEN_STRONG_INLINE PacketXi pmin<PacketXi>(const PacketXi& a, const PacketXi& b) {
129  return svmin_s32_x(svptrue_b32(), a, b);
130 }
131 
// Lane-wise signed int32 maximum of a and b (svmax_s32_x under an all-true predicate).
132 template <>
133 EIGEN_STRONG_INLINE PacketXi pmax<PacketXi>(const PacketXi& a, const PacketXi& b) {
134  return svmax_s32_x(svptrue_b32(), a, b);
135 }
136 
// Lane-wise a <= b as a bit mask.
// The predicate returned by svcmple_s32 drives a zeroing svdup: lanes where the
// comparison holds receive the 0xffffffff bit pattern, every other lane is 0.
137 template <>
138 EIGEN_STRONG_INLINE PacketXi pcmp_le<PacketXi>(const PacketXi& a, const PacketXi& b) {
139  return svdup_n_s32_z(svcmple_s32(svptrue_b32(), a, b), 0xffffffffu);
140 }
141 
// Lane-wise a < b as a bit mask.
// The predicate returned by svcmplt_s32 drives a zeroing svdup: lanes where the
// comparison holds receive the 0xffffffff bit pattern, every other lane is 0.
142 template <>
143 EIGEN_STRONG_INLINE PacketXi pcmp_lt<PacketXi>(const PacketXi& a, const PacketXi& b) {
144  return svdup_n_s32_z(svcmplt_s32(svptrue_b32(), a, b), 0xffffffffu);
145 }
146 
// Lane-wise a == b as a bit mask.
// The predicate returned by svcmpeq_s32 drives a zeroing svdup: lanes where the
// comparison holds receive the 0xffffffff bit pattern, every other lane is 0.
147 template <>
148 EIGEN_STRONG_INLINE PacketXi pcmp_eq<PacketXi>(const PacketXi& a, const PacketXi& b) {
149  return svdup_n_s32_z(svcmpeq_s32(svptrue_b32(), a, b), 0xffffffffu);
150 }
151 
// Returns a packet whose lanes are all set to the 0xffffffff bit pattern.
// The argument is unused; it only selects the packet type.
152 template <>
153 EIGEN_STRONG_INLINE PacketXi ptrue<PacketXi>(const PacketXi& /*a*/) {
154  return svdup_n_s32_x(svptrue_b32(), 0xffffffffu);
155 }
156 
// Returns a packet whose lanes are all 0.
// The argument is unused; it only selects the packet type.
157 template <>
158 EIGEN_STRONG_INLINE PacketXi pzero<PacketXi>(const PacketXi& /*a*/) {
159  return svdup_n_s32_x(svptrue_b32(), 0);
160 }
161 
// Bitwise AND of a and b (svand_s32_x under an all-true predicate).
162 template <>
163 EIGEN_STRONG_INLINE PacketXi pand<PacketXi>(const PacketXi& a, const PacketXi& b) {
164  return svand_s32_x(svptrue_b32(), a, b);
165 }
166 
// Bitwise OR of a and b (svorr_s32_x under an all-true predicate).
167 template <>
168 EIGEN_STRONG_INLINE PacketXi por<PacketXi>(const PacketXi& a, const PacketXi& b) {
169  return svorr_s32_x(svptrue_b32(), a, b);
170 }
171 
// Bitwise exclusive OR of a and b (sveor_s32_x under an all-true predicate).
172 template <>
173 EIGEN_STRONG_INLINE PacketXi pxor<PacketXi>(const PacketXi& a, const PacketXi& b) {
174  return sveor_s32_x(svptrue_b32(), a, b);
175 }
176 
// Bitwise "and-not": svbic (bit clear) keeps the bits of a that are not set in
// b, i.e. a & ~b.
177 template <>
178 EIGEN_STRONG_INLINE PacketXi pandnot<PacketXi>(const PacketXi& a, const PacketXi& b) {
179  return svbic_s32_x(svptrue_b32(), a, b);
180 }
181 
// Arithmetic (sign-propagating) right shift of every int32 lane by the
// compile-time amount N.
// svasr is used instead of svasrd on purpose: ASRD is the "shift right for
// divide" form, which rounds negative inputs toward zero (-1 shifted by 1 would
// give 0 rather than -1) and only accepts immediates starting at 1, so it does
// not implement a plain `a >> N`.
182 template <int N>
184  return svasr_n_s32_x(svptrue_b32(), a, N);
185 }
186 
187 template <int N>
189  return svreinterpret_s32_u32(svlsr_n_u32_x(svptrue_b32(), svreinterpret_u32_s32(a), N));
190 }
191 
192 template <int N>
194  return svlsl_n_s32_x(svptrue_b32(), a, N);
195 }
196 
197 template <>
199  EIGEN_DEBUG_ALIGNED_LOAD return svld1_s32(svptrue_b32(), from);
200 }
201 
202 template <>
204  EIGEN_DEBUG_UNALIGNED_LOAD return svld1_s32(svptrue_b32(), from);
205 }
206 
207 template <>
209  svuint32_t indices = svindex_u32(0, 1); // index {base=0, base+step=1, base+step*2, ...}
210  indices = svzip1_u32(indices, indices); // index in the format {a0, a0, a1, a1, a2, a2, ...}
211  return svld1_gather_u32index_s32(svptrue_b32(), from, indices);
212 }
213 
214 template <>
216  svuint32_t indices = svindex_u32(0, 1); // index {base=0, base+step=1, base+step*2, ...}
217  indices = svzip1_u32(indices, indices); // index in the format {a0, a0, a1, a1, a2, a2, ...}
218  indices = svzip1_u32(indices, indices); // index in the format {a0, a0, a0, a0, a1, a1, a1, a1, ...}
219  return svld1_gather_u32index_s32(svptrue_b32(), from, indices);
220 }
221 
// Stores every lane of `from` to consecutive int32 slots starting at `to`.
// Uses the same svst1_s32 call as pstoreu; only the debug hook
// (EIGEN_DEBUG_ALIGNED_STORE) differs.
222 template <>
223 EIGEN_STRONG_INLINE void pstore<numext::int32_t>(numext::int32_t* to, const PacketXi& from) {
224  EIGEN_DEBUG_ALIGNED_STORE svst1_s32(svptrue_b32(), to, from);
225 }
226 
// Unaligned variant of the packet store: writes every lane of `from` to
// consecutive int32 slots starting at `to` with svst1_s32, preceded by the
// EIGEN_DEBUG_UNALIGNED_STORE hook.
227 template <>
228 EIGEN_STRONG_INLINE void pstoreu<numext::int32_t>(numext::int32_t* to, const PacketXi& from) {
229  EIGEN_DEBUG_UNALIGNED_STORE svst1_s32(svptrue_b32(), to, from);
230 }
231 
// Strided load: lane i is read from from[i * stride] through a gather load whose
// element indices are generated by svindex_s32(0, stride).
// NOTE(review): `stride` is an Index but the index vector holds 32-bit values,
// so the stride is implicitly narrowed - confirm strides always fit in int32.
232 template <>
233 EIGEN_DEVICE_FUNC inline PacketXi pgather<numext::int32_t, PacketXi>(const numext::int32_t* from, Index stride) {
234  // Index format: {base=0, base+stride, base+stride*2, base+stride*3, ...}
235  svint32_t indices = svindex_s32(0, stride);
236  return svld1_gather_s32index_s32(svptrue_b32(), from, indices);
237 }
238 
// Strided store: lane i of `from` is written to to[i * stride] through a scatter
// store whose element indices are generated by svindex_s32(0, stride).
// NOTE(review): `stride` is an Index but the index vector holds 32-bit values,
// so the stride is implicitly narrowed - confirm strides always fit in int32.
239 template <>
240 EIGEN_DEVICE_FUNC inline void pscatter<numext::int32_t, PacketXi>(numext::int32_t* to, const PacketXi& from,
241  Index stride) {
242  // Index format: {base=0, base+stride, base+stride*2, base+stride*3, ...}
243  svint32_t indices = svindex_s32(0, stride);
244  svst1_scatter_s32index_s32(svptrue_b32(), to, indices, from);
245 }
246 
247 template <>
249  // svlasta returns the first element if all predicate bits are 0
250  return svlasta_s32(svpfalse_b(), a);
251 }
252 
// Returns the packet with its lanes in reverse order (svrev_s32).
253 template <>
254 EIGEN_STRONG_INLINE PacketXi preverse(const PacketXi& a) {
255  return svrev_s32(a);
256 }
257 
// Lane-wise int32 absolute value (svabs_s32_x under an all-true predicate).
258 template <>
259 EIGEN_STRONG_INLINE PacketXi pabs(const PacketXi& a) {
260  return svabs_s32_x(svptrue_b32(), a);
261 }
262 
263 template <>
265  return static_cast<numext::int32_t>(svaddv_s32(svptrue_b32(), a));
266 }
267 
268 template <>
270  EIGEN_STATIC_ASSERT((EIGEN_ARM64_SVE_VL % 128 == 0), EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT);
271 
272  // Multiply the vector by its reverse
273  svint32_t prod = svmul_s32_x(svptrue_b32(), a, svrev_s32(a));
274  svint32_t half_prod;
275 
276  // Extract the high half of the vector. Depending on the VL more reductions need to be done
277  if (EIGEN_ARM64_SVE_VL >= 2048) {
278  half_prod = svtbl_s32(prod, svindex_u32(32, 1));
279  prod = svmul_s32_x(svptrue_b32(), prod, half_prod);
280  }
281  if (EIGEN_ARM64_SVE_VL >= 1024) {
282  half_prod = svtbl_s32(prod, svindex_u32(16, 1));
283  prod = svmul_s32_x(svptrue_b32(), prod, half_prod);
284  }
285  if (EIGEN_ARM64_SVE_VL >= 512) {
286  half_prod = svtbl_s32(prod, svindex_u32(8, 1));
287  prod = svmul_s32_x(svptrue_b32(), prod, half_prod);
288  }
289  if (EIGEN_ARM64_SVE_VL >= 256) {
290  half_prod = svtbl_s32(prod, svindex_u32(4, 1));
291  prod = svmul_s32_x(svptrue_b32(), prod, half_prod);
292  }
293  // Last reduction
294  half_prod = svtbl_s32(prod, svindex_u32(2, 1));
295  prod = svmul_s32_x(svptrue_b32(), prod, half_prod);
296 
297  // The reduction is done to the first element.
298  return pfirst<PacketXi>(prod);
299 }
300 
301 template <>
303  return svminv_s32(svptrue_b32(), a);
304 }
305 
306 template <>
308  return svmaxv_s32(svptrue_b32(), a);
309 }
310 
311 template <int N>
313  int buffer[packet_traits<numext::int32_t>::size * N] = {0};
314  int i = 0;
315 
316  PacketXi stride_index = svindex_s32(0, N);
317 
318  for (i = 0; i < N; i++) {
319  svst1_scatter_s32index_s32(svptrue_b32(), buffer + i, stride_index, kernel.packet[i]);
320  }
321  for (i = 0; i < N; i++) {
322  kernel.packet[i] = svld1_s32(svptrue_b32(), buffer + i * packet_traits<numext::int32_t>::size);
323  }
324 }
325 
326 /********************************* float32 ************************************/
327 
// Single-precision float packet: svfloat32_t given a fixed size of
// EIGEN_ARM64_SVE_VL bits through the arm_sve_vector_bits attribute.
328 typedef svfloat32_t PacketXf __attribute__((arm_sve_vector_bits(EIGEN_ARM64_SVE_VL)));
329 
// Scalar-to-packet traits for float on SVE.
// `type` and `half` both name PacketXf. The enum is a capability table: entries
// set to 1 are advertised as available for this packet type, entries set to 0
// are switched off.
330 template <>
331 struct packet_traits<float> : default_packet_traits {
332  typedef PacketXf type;
333  typedef PacketXf half;
334 
335  enum {
336  Vectorizable = 1,
337  AlignedOnScalar = 1,
339 
340  HasAdd = 1,
341  HasSub = 1,
342  HasShift = 1,
343  HasMul = 1,
344  HasNegate = 1,
345  HasAbs = 1,
346  HasArg = 0,
347  HasAbs2 = 1,
348  HasMin = 1,
349  HasMax = 1,
350  HasConj = 1,
351  HasSetLinear = 0,
352  HasBlend = 0,
353  HasReduxp = 0, // Not implemented in SVE
354 
355  HasDiv = 1,
356 
359  HasLog = 1,
360  HasExp = 1,
361  HasSqrt = 1,
365  };
366 };
367 
// Packet-to-scalar traits for PacketXf: the scalar type is float, the
// same-width integer packet is PacketXi, and `half` aliases the full packet
// because no half-width SVE packet is implemented yet. Masked stores are
// reported as unavailable.
368 template <>
369 struct unpacket_traits<PacketXf> {
370  typedef float type;
371  typedef PacketXf half; // Half not yet implemented
372  typedef PacketXi integer_packet;
373 
374  enum {
377  vectorizable = true,
379  masked_store_available = false
380  };
381 };
382 
// Broadcasts the scalar `from` into every lane of a float packet (svdup_n_f32).
383 template <>
384 EIGEN_STRONG_INLINE PacketXf pset1<PacketXf>(const float& from) {
385  return svdup_n_f32(from);
386 }
387 
388 template <>
390  return svreinterpret_f32_u32(svdup_n_u32_x(svptrue_b32(), from));
391 }
392 
// Linear sequence starting at a: lane i holds a + i.
// The offsets 0, 1, 2, ... are written into the staging array `c`, loaded as a
// packet and added to the broadcast of a.
393 template <>
394 EIGEN_STRONG_INLINE PacketXf plset<PacketXf>(const float& a) {
396  for (int i = 0; i < packet_traits<float>::size; i++) c[i] = i;
397  return svadd_f32_x(svptrue_b32(), pset1<PacketXf>(a), svld1_f32(svptrue_b32(), c));
398 }
399 
// Lane-wise float sum a + b (svadd_f32_x under an all-true predicate).
400 template <>
401 EIGEN_STRONG_INLINE PacketXf padd<PacketXf>(const PacketXf& a, const PacketXf& b) {
402  return svadd_f32_x(svptrue_b32(), a, b);
403 }
404 
// Lane-wise float difference a - b (svsub_f32_x under an all-true predicate).
405 template <>
406 EIGEN_STRONG_INLINE PacketXf psub<PacketXf>(const PacketXf& a, const PacketXf& b) {
407  return svsub_f32_x(svptrue_b32(), a, b);
408 }
409 
// Lane-wise float negation of a (svneg_f32_x under an all-true predicate).
410 template <>
411 EIGEN_STRONG_INLINE PacketXf pnegate(const PacketXf& a) {
412  return svneg_f32_x(svptrue_b32(), a);
413 }
414 
// Complex conjugate of a real float packet: the input is returned unchanged.
415 template <>
416 EIGEN_STRONG_INLINE PacketXf pconj(const PacketXf& a) {
417  return a;
418 }
419 
// Lane-wise float product a * b (svmul_f32_x under an all-true predicate).
420 template <>
421 EIGEN_STRONG_INLINE PacketXf pmul<PacketXf>(const PacketXf& a, const PacketXf& b) {
422  return svmul_f32_x(svptrue_b32(), a, b);
423 }
424 
// Lane-wise float quotient a / b (svdiv_f32_x under an all-true predicate).
425 template <>
426 EIGEN_STRONG_INLINE PacketXf pdiv<PacketXf>(const PacketXf& a, const PacketXf& b) {
427  return svdiv_f32_x(svptrue_b32(), a, b);
428 }
429 
// Lane-wise multiply-add a * b + c.
// svmla takes the accumulator first, hence the (c, a, b) argument order.
430 template <>
431 EIGEN_STRONG_INLINE PacketXf pmadd(const PacketXf& a, const PacketXf& b, const PacketXf& c) {
432  return svmla_f32_x(svptrue_b32(), c, a, b);
433 }
434 
// Lane-wise float minimum of a and b (svmin_f32_x under an all-true predicate).
435 template <>
436 EIGEN_STRONG_INLINE PacketXf pmin<PacketXf>(const PacketXf& a, const PacketXf& b) {
437  return svmin_f32_x(svptrue_b32(), a, b);
438 }
439 
// NaN-propagating minimum: forwards to the default pmin<PacketXf>.
// NOTE(review): this relies on svmin_f32_x returning NaN whenever either input
// is NaN - confirm against the ACLE description of svmin.
440 template <>
441 EIGEN_STRONG_INLINE PacketXf pmin<PropagateNaN, PacketXf>(const PacketXf& a, const PacketXf& b) {
442  return pmin<PacketXf>(a, b);
443 }
444 
// Number-propagating minimum: uses svminnm_f32_x (the "minimum number" variant)
// rather than svmin_f32_x.
445 template <>
446 EIGEN_STRONG_INLINE PacketXf pmin<PropagateNumbers, PacketXf>(const PacketXf& a, const PacketXf& b) {
447  return svminnm_f32_x(svptrue_b32(), a, b);
448 }
449 
// Lane-wise float maximum of a and b (svmax_f32_x under an all-true predicate).
450 template <>
451 EIGEN_STRONG_INLINE PacketXf pmax<PacketXf>(const PacketXf& a, const PacketXf& b) {
452  return svmax_f32_x(svptrue_b32(), a, b);
453 }
454 
// NaN-propagating maximum: forwards to the default pmax<PacketXf>.
// NOTE(review): this relies on svmax_f32_x returning NaN whenever either input
// is NaN - confirm against the ACLE description of svmax.
455 template <>
456 EIGEN_STRONG_INLINE PacketXf pmax<PropagateNaN, PacketXf>(const PacketXf& a, const PacketXf& b) {
457  return pmax<PacketXf>(a, b);
458 }
459 
// Number-propagating maximum: uses svmaxnm_f32_x (the "maximum number" variant)
// rather than svmax_f32_x.
460 template <>
461 EIGEN_STRONG_INLINE PacketXf pmax<PropagateNumbers, PacketXf>(const PacketXf& a, const PacketXf& b) {
462  return svmaxnm_f32_x(svptrue_b32(), a, b);
463 }
464 
465 // Float comparisons in SVE return an svbool_t predicate. A zeroing svdup turns it
466 // into a bit mask: active lanes become all-ones (0xffffffffu), inactive lanes 0.
// Lane-wise a <= b; the u32 mask is reinterpreted as a float packet.
467 template <>
468 EIGEN_STRONG_INLINE PacketXf pcmp_le<PacketXf>(const PacketXf& a, const PacketXf& b) {
469  return svreinterpret_f32_u32(svdup_n_u32_z(svcmple_f32(svptrue_b32(), a, b), 0xffffffffu));
470 }
471 
// Lane-wise a < b as a bit mask: the svcmplt_f32 predicate drives a zeroing
// svdup of 0xffffffff, and the u32 result is reinterpreted as a float packet.
472 template <>
473 EIGEN_STRONG_INLINE PacketXf pcmp_lt<PacketXf>(const PacketXf& a, const PacketXf& b) {
474  return svreinterpret_f32_u32(svdup_n_u32_z(svcmplt_f32(svptrue_b32(), a, b), 0xffffffffu));
475 }
476 
// Lane-wise a == b as a bit mask: the svcmpeq_f32 predicate drives a zeroing
// svdup of 0xffffffff, and the u32 result is reinterpreted as a float packet.
477 template <>
478 EIGEN_STRONG_INLINE PacketXf pcmp_eq<PacketXf>(const PacketXf& a, const PacketXf& b) {
479  return svreinterpret_f32_u32(svdup_n_u32_z(svcmpeq_f32(svptrue_b32(), a, b), 0xffffffffu));
480 }
481 
482 // Invert (svnot_b_z) the predicate produced by the greater-or-equal comparison
483 // (svcmpge_f32), which selects every lane where a >= b does not hold. Those
484 // lanes are then filled with all-one bits; the remaining lanes are zero.
// NOTE(review): lanes holding a NaN are expected to fail the >= test and so be
// selected as well - confirm against the ACLE description of svcmpge.
485 template <>
486 EIGEN_STRONG_INLINE PacketXf pcmp_lt_or_nan<PacketXf>(const PacketXf& a, const PacketXf& b) {
487  return svreinterpret_f32_u32(svdup_n_u32_z(svnot_b_z(svptrue_b32(), svcmpge_f32(svptrue_b32(), a, b)), 0xffffffffu));
488 }
489 
// Lane-wise floor, implemented with the svrintm_f32_x rounding intrinsic under
// an all-true predicate.
490 template <>
491 EIGEN_STRONG_INLINE PacketXf pfloor<PacketXf>(const PacketXf& a) {
492  return svrintm_f32_x(svptrue_b32(), a);
493 }
494 
// Returns a float packet whose lanes all carry the 0xffffffff bit pattern,
// built as a u32 broadcast and reinterpreted. The argument is unused.
495 template <>
496 EIGEN_STRONG_INLINE PacketXf ptrue<PacketXf>(const PacketXf& /*a*/) {
497  return svreinterpret_f32_u32(svdup_n_u32_x(svptrue_b32(), 0xffffffffu));
498 }
499 
500 // Logical operations are not available on float vectors, so the operands are
// reinterpreted as u32, combined, and the result is reinterpreted back to float.
// Bitwise AND of a and b.
501 template <>
502 EIGEN_STRONG_INLINE PacketXf pand<PacketXf>(const PacketXf& a, const PacketXf& b) {
503  return svreinterpret_f32_u32(svand_u32_x(svptrue_b32(), svreinterpret_u32_f32(a), svreinterpret_u32_f32(b)));
504 }
505 
// Bitwise OR of a and b, computed on the u32 reinterpretation of both operands
// and reinterpreted back to float.
506 template <>
507 EIGEN_STRONG_INLINE PacketXf por<PacketXf>(const PacketXf& a, const PacketXf& b) {
508  return svreinterpret_f32_u32(svorr_u32_x(svptrue_b32(), svreinterpret_u32_f32(a), svreinterpret_u32_f32(b)));
509 }
510 
// Bitwise exclusive OR of a and b, computed on the u32 reinterpretation of both
// operands and reinterpreted back to float.
511 template <>
512 EIGEN_STRONG_INLINE PacketXf pxor<PacketXf>(const PacketXf& a, const PacketXf& b) {
513  return svreinterpret_f32_u32(sveor_u32_x(svptrue_b32(), svreinterpret_u32_f32(a), svreinterpret_u32_f32(b)));
514 }
515 
// Bitwise "and-not" on the u32 reinterpretation of the operands: svbic (bit
// clear) keeps the bits of a that are not set in b, i.e. a & ~b.
516 template <>
517 EIGEN_STRONG_INLINE PacketXf pandnot<PacketXf>(const PacketXf& a, const PacketXf& b) {
518  return svreinterpret_f32_u32(svbic_u32_x(svptrue_b32(), svreinterpret_u32_f32(a), svreinterpret_u32_f32(b)));
519 }
520 
// Loads a full packet of consecutive floats starting at `from`.
// Uses the same svld1_f32 call as ploadu; only the debug hook
// (EIGEN_DEBUG_ALIGNED_LOAD) differs.
521 template <>
522 EIGEN_STRONG_INLINE PacketXf pload<PacketXf>(const float* from) {
523  EIGEN_DEBUG_ALIGNED_LOAD return svld1_f32(svptrue_b32(), from);
524 }
525 
// Unaligned variant of the packet load: reads a full packet of consecutive
// floats starting at `from` with svld1_f32, preceded by the
// EIGEN_DEBUG_UNALIGNED_LOAD hook.
526 template <>
527 EIGEN_STRONG_INLINE PacketXf ploadu<PacketXf>(const float* from) {
528  EIGEN_DEBUG_UNALIGNED_LOAD return svld1_f32(svptrue_b32(), from);
529 }
530 
// Loads floats from `from` with each element duplicated:
// {from[0], from[0], from[1], from[1], ...}.
// The index vector 0, 1, 2, ... is zipped with itself to repeat every index
// twice, then used for a gather load.
531 template <>
532 EIGEN_STRONG_INLINE PacketXf ploaddup<PacketXf>(const float* from) {
533  svuint32_t indices = svindex_u32(0, 1); // index {base=0, base+step=1, base+step*2, ...}
534  indices = svzip1_u32(indices, indices); // index in the format {a0, a0, a1, a1, a2, a2, ...}
535  return svld1_gather_u32index_f32(svptrue_b32(), from, indices);
536 }
537 
// Loads floats from `from` with each element repeated four times:
// {from[0], from[0], from[0], from[0], from[1], ...}.
// The index vector 0, 1, 2, ... is zipped with itself twice so every index
// appears four times, then used for a gather load.
538 template <>
539 EIGEN_STRONG_INLINE PacketXf ploadquad<PacketXf>(const float* from) {
540  svuint32_t indices = svindex_u32(0, 1); // index {base=0, base+step=1, base+step*2, ...}
541  indices = svzip1_u32(indices, indices); // index in the format {a0, a0, a1, a1, a2, a2, ...}
542  indices = svzip1_u32(indices, indices); // index in the format {a0, a0, a0, a0, a1, a1, a1, a1, ...}
543  return svld1_gather_u32index_f32(svptrue_b32(), from, indices);
544 }
545 
// Stores every lane of `from` to consecutive float slots starting at `to`.
// Uses the same svst1_f32 call as pstoreu; only the debug hook
// (EIGEN_DEBUG_ALIGNED_STORE) differs.
546 template <>
547 EIGEN_STRONG_INLINE void pstore<float>(float* to, const PacketXf& from) {
548  EIGEN_DEBUG_ALIGNED_STORE svst1_f32(svptrue_b32(), to, from);
549 }
550 
// Unaligned variant of the packet store: writes every lane of `from` to
// consecutive float slots starting at `to` with svst1_f32, preceded by the
// EIGEN_DEBUG_UNALIGNED_STORE hook.
551 template <>
552 EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const PacketXf& from) {
553  EIGEN_DEBUG_UNALIGNED_STORE svst1_f32(svptrue_b32(), to, from);
554 }
555 
// Strided load: lane i is read from from[i * stride] through a gather load whose
// element indices are generated by svindex_s32(0, stride).
// NOTE(review): `stride` is an Index but the index vector holds 32-bit values,
// so the stride is implicitly narrowed - confirm strides always fit in int32.
556 template <>
557 EIGEN_DEVICE_FUNC inline PacketXf pgather<float, PacketXf>(const float* from, Index stride) {
558  // Index format: {base=0, base+stride, base+stride*2, base+stride*3, ...}
559  svint32_t indices = svindex_s32(0, stride);
560  return svld1_gather_s32index_f32(svptrue_b32(), from, indices);
561 }
562 
// Strided store: lane i of `from` is written to to[i * stride] through a scatter
// store whose element indices are generated by svindex_s32(0, stride).
// NOTE(review): `stride` is an Index but the index vector holds 32-bit values,
// so the stride is implicitly narrowed - confirm strides always fit in int32.
563 template <>
564 EIGEN_DEVICE_FUNC inline void pscatter<float, PacketXf>(float* to, const PacketXf& from, Index stride) {
565  // Index format: {base=0, base+stride, base+stride*2, base+stride*3, ...}
566  svint32_t indices = svindex_s32(0, stride);
567  svst1_scatter_s32index_f32(svptrue_b32(), to, indices, from);
568 }
569 
// Extracts the first lane of the packet as a scalar.
570 template <>
571 EIGEN_STRONG_INLINE float pfirst<PacketXf>(const PacketXf& a) {
572  // With an all-false predicate there is no "last active" lane, so svlasta
573  // falls back to returning the first element.
573  return svlasta_f32(svpfalse_b(), a);
574 }
575 
// Returns the packet with its lanes in reverse order (svrev_f32).
576 template <>
577 EIGEN_STRONG_INLINE PacketXf preverse(const PacketXf& a) {
578  return svrev_f32(a);
579 }
580 
// Lane-wise float absolute value (svabs_f32_x under an all-true predicate).
581 template <>
582 EIGEN_STRONG_INLINE PacketXf pabs(const PacketXf& a) {
583  return svabs_f32_x(svptrue_b32(), a);
584 }
585 
586 // TODO(tellenbach): Should this go into MathFunctions.h? If so, change it for
587 // all vector extensions and the generic version.
// No SVE-specific code here: the call is forwarded to pfrexp_generic, which
// receives `exponent` by reference as its output argument.
588 template <>
589 EIGEN_STRONG_INLINE PacketXf pfrexp<PacketXf>(const PacketXf& a, PacketXf& exponent) {
590  return pfrexp_generic(a, exponent);
591 }
592 
// Horizontal reduction: returns the sum of all lanes as a scalar, using
// svaddv_f32 under an all-true predicate.
593 template <>
594 EIGEN_STRONG_INLINE float predux<PacketXf>(const PacketXf& a) {
595  return svaddv_f32(svptrue_b32(), a);
596 }
597 
598 // Other reduction functions:
599 // mul
600 // Only works for SVE vector lengths (VLs) that are a multiple of 128 bits
601 template <>
603  EIGEN_STATIC_ASSERT((EIGEN_ARM64_SVE_VL % 128 == 0), EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT);
604  // Multiply the vector by its reverse
605  svfloat32_t prod = svmul_f32_x(svptrue_b32(), a, svrev_f32(a));
606  svfloat32_t half_prod;
607 
608  // Extract the high half of the vector. Depending on the VL more reductions need to be done
609  if (EIGEN_ARM64_SVE_VL >= 2048) {
610  half_prod = svtbl_f32(prod, svindex_u32(32, 1));
611  prod = svmul_f32_x(svptrue_b32(), prod, half_prod);
612  }
613  if (EIGEN_ARM64_SVE_VL >= 1024) {
614  half_prod = svtbl_f32(prod, svindex_u32(16, 1));
615  prod = svmul_f32_x(svptrue_b32(), prod, half_prod);
616  }
617  if (EIGEN_ARM64_SVE_VL >= 512) {
618  half_prod = svtbl_f32(prod, svindex_u32(8, 1));
619  prod = svmul_f32_x(svptrue_b32(), prod, half_prod);
620  }
621  if (EIGEN_ARM64_SVE_VL >= 256) {
622  half_prod = svtbl_f32(prod, svindex_u32(4, 1));
623  prod = svmul_f32_x(svptrue_b32(), prod, half_prod);
624  }
625  // Last reduction
626  half_prod = svtbl_f32(prod, svindex_u32(2, 1));
627  prod = svmul_f32_x(svptrue_b32(), prod, half_prod);
628 
629  // The reduction is done to the first element.
630  return pfirst<PacketXf>(prod);
631 }
632 
633 template <>
635  return svminv_f32(svptrue_b32(), a);
636 }
637 
638 template <>
640  return svmaxv_f32(svptrue_b32(), a);
641 }
642 
643 template <int N>
645  float buffer[packet_traits<float>::size * N] = {0};
646  int i = 0;
647 
648  PacketXi stride_index = svindex_s32(0, N);
649 
650  for (i = 0; i < N; i++) {
651  svst1_scatter_s32index_f32(svptrue_b32(), buffer + i, stride_index, kernel.packet[i]);
652  }
653 
654  for (i = 0; i < N; i++) {
655  kernel.packet[i] = svld1_f32(svptrue_b32(), buffer + i * packet_traits<float>::size);
656  }
657 }
658 
// No SVE-specific code here: the call is forwarded to pldexp_generic with the
// same arguments.
659 template <>
660 EIGEN_STRONG_INLINE PacketXf pldexp<PacketXf>(const PacketXf& a, const PacketXf& exponent) {
661  return pldexp_generic(a, exponent);
662 }
663 
// Lane-wise float square root (svsqrt_f32_x under an all-true predicate).
664 template <>
665 EIGEN_STRONG_INLINE PacketXf psqrt<PacketXf>(const PacketXf& a) {
666  return svsqrt_f32_x(svptrue_b32(), a);
667 }
668 
669 } // namespace internal
670 } // namespace Eigen
671 
672 #endif // EIGEN_PACKET_MATH_SVE_H
int i
Definition: BiCGSTAB_step_by_step.cpp:9
#define EIGEN_DEBUG_ALIGNED_STORE
Definition: GenericPacketMath.h:38
#define EIGEN_DEBUG_ALIGNED_LOAD
Definition: GenericPacketMath.h:30
#define EIGEN_DEBUG_UNALIGNED_STORE
Definition: GenericPacketMath.h:42
#define EIGEN_DEBUG_UNALIGNED_LOAD
Definition: GenericPacketMath.h:34
#define EIGEN_DEVICE_FUNC
Definition: Macros.h:892
#define EIGEN_FAST_MATH
Definition: Macros.h:51
#define EIGEN_STRONG_INLINE
Definition: Macros.h:834
#define EIGEN_STATIC_ASSERT(X, MSG)
Definition: StaticAssert.h:26
Scalar Scalar int size
Definition: benchVecAdd.cpp:17
Scalar * b
Definition: benchVecAdd.cpp:17
SCALAR Scalar
Definition: bench_gemm.cpp:45
@ N
Definition: constructor.cpp:22
@ Aligned64
Definition: Constants.h:239
const Scalar * a
Definition: level2_cplx_impl.h:32
EIGEN_STRONG_INLINE PacketXf pxor< PacketXf >(const PacketXf &a, const PacketXf &b)
Definition: SVE/PacketMath.h:512
EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf &a)
Definition: AltiVec/Complex.h:268
EIGEN_STRONG_INLINE PacketXf pmin< PropagateNumbers, PacketXf >(const PacketXf &a, const PacketXf &b)
Definition: SVE/PacketMath.h:446
EIGEN_STRONG_INLINE PacketXf pmul< PacketXf >(const PacketXf &a, const PacketXf &b)
Definition: SVE/PacketMath.h:421
EIGEN_STRONG_INLINE float predux_mul< PacketXf >(const PacketXf &a)
Definition: SVE/PacketMath.h:602
EIGEN_STRONG_INLINE numext::int32_t predux_max< PacketXi >(const PacketXi &a)
Definition: SVE/PacketMath.h:307
EIGEN_STRONG_INLINE PacketXf pcmp_lt< PacketXf >(const PacketXf &a, const PacketXf &b)
Definition: SVE/PacketMath.h:473
EIGEN_STRONG_INLINE PacketXi por< PacketXi >(const PacketXi &a, const PacketXi &b)
Definition: SVE/PacketMath.h:168
EIGEN_STRONG_INLINE PacketXf pfloor< PacketXf >(const PacketXf &a)
Definition: SVE/PacketMath.h:491
EIGEN_STRONG_INLINE PacketXi ploaddup< PacketXi >(const numext::int32_t *from)
Definition: SVE/PacketMath.h:208
EIGEN_STRONG_INLINE PacketXi psub< PacketXi >(const PacketXi &a, const PacketXi &b)
Definition: SVE/PacketMath.h:98
EIGEN_STRONG_INLINE float predux< PacketXf >(const PacketXf &a)
Definition: SVE/PacketMath.h:594
EIGEN_STRONG_INLINE PacketXf pcmp_lt_or_nan< PacketXf >(const PacketXf &a, const PacketXf &b)
Definition: SVE/PacketMath.h:486
EIGEN_STRONG_INLINE PacketXf pcmp_eq< PacketXf >(const PacketXf &a, const PacketXf &b)
Definition: SVE/PacketMath.h:478
EIGEN_STRONG_INLINE float pfirst< PacketXf >(const PacketXf &a)
Definition: SVE/PacketMath.h:571
EIGEN_STRONG_INLINE void ptranspose(PacketBlock< Packet2cf, 2 > &kernel)
Definition: AltiVec/Complex.h:339
EIGEN_STRONG_INLINE PacketXi pload< PacketXi >(const numext::int32_t *from)
Definition: SVE/PacketMath.h:198
EIGEN_STRONG_INLINE PacketXf pandnot< PacketXf >(const PacketXf &a, const PacketXf &b)
Definition: SVE/PacketMath.h:517
EIGEN_STRONG_INLINE numext::int32_t predux< PacketXi >(const PacketXi &a)
Definition: SVE/PacketMath.h:264
EIGEN_STRONG_INLINE PacketXf padd< PacketXf >(const PacketXf &a, const PacketXf &b)
Definition: SVE/PacketMath.h:401
EIGEN_STRONG_INLINE PacketXf pcmp_le< PacketXf >(const PacketXf &a, const PacketXf &b)
Definition: SVE/PacketMath.h:468
EIGEN_STRONG_INLINE PacketXf pmax< PropagateNaN, PacketXf >(const PacketXf &a, const PacketXf &b)
Definition: SVE/PacketMath.h:456
EIGEN_STRONG_INLINE PacketXi plset< PacketXi >(const numext::int32_t &a)
Definition: SVE/PacketMath.h:86
EIGEN_STRONG_INLINE Packet4i plogical_shift_left(const Packet4i &a)
Definition: AltiVec/PacketMath.h:1983
EIGEN_STRONG_INLINE PacketXi pcmp_eq< PacketXi >(const PacketXi &a, const PacketXi &b)
Definition: SVE/PacketMath.h:148
EIGEN_STRONG_INLINE PacketXi pmax< PacketXi >(const PacketXi &a, const PacketXi &b)
Definition: SVE/PacketMath.h:133
EIGEN_STRONG_INLINE PacketXi pdiv< PacketXi >(const PacketXi &a, const PacketXi &b)
Definition: SVE/PacketMath.h:118
EIGEN_STRONG_INLINE Packet4i plogical_shift_right(const Packet4i &a)
Definition: AltiVec/PacketMath.h:1979
EIGEN_STRONG_INLINE float predux_min< PacketXf >(const PacketXf &a)
Definition: SVE/PacketMath.h:634
EIGEN_STRONG_INLINE PacketXi ptrue< PacketXi >(const PacketXi &)
Definition: SVE/PacketMath.h:153
EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf &a)
Definition: AltiVec/Complex.h:303
EIGEN_STRONG_INLINE PacketXf psub< PacketXf >(const PacketXf &a, const PacketXf &b)
Definition: SVE/PacketMath.h:406
EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f &a, const Packet4f &b, const Packet4f &c)
Definition: AltiVec/PacketMath.h:1218
EIGEN_STRONG_INLINE PacketXf por< PacketXf >(const PacketXf &a, const PacketXf &b)
Definition: SVE/PacketMath.h:507
EIGEN_STRONG_INLINE PacketXi ploadquad< PacketXi >(const numext::int32_t *from)
Definition: SVE/PacketMath.h:215
EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf &a)
Definition: AltiVec/Complex.h:264
EIGEN_STRONG_INLINE PacketXf plset< PacketXf >(const float &a)
Definition: SVE/PacketMath.h:394
EIGEN_STRONG_INLINE PacketXf ptrue< PacketXf >(const PacketXf &)
Definition: SVE/PacketMath.h:496
EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packet4i &a)
Definition: AltiVec/PacketMath.h:1975
EIGEN_STRONG_INLINE PacketXi pset1< PacketXi >(const numext::int32_t &from)
Definition: SVE/PacketMath.h:81
EIGEN_STRONG_INLINE PacketXi pxor< PacketXi >(const PacketXi &a, const PacketXi &b)
Definition: SVE/PacketMath.h:173
EIGEN_STRONG_INLINE PacketXf pmin< PacketXf >(const PacketXf &a, const PacketXf &b)
Definition: SVE/PacketMath.h:436
EIGEN_STRONG_INLINE PacketXf pset1frombits< PacketXf >(numext::uint32_t from)
Definition: SVE/PacketMath.h:389
EIGEN_STRONG_INLINE PacketXf pldexp< PacketXf >(const PacketXf &a, const PacketXf &exponent)
Definition: SVE/PacketMath.h:660
EIGEN_STRONG_INLINE PacketXf pmin< PropagateNaN, PacketXf >(const PacketXf &a, const PacketXf &b)
Definition: SVE/PacketMath.h:441
EIGEN_STRONG_INLINE PacketXf pfrexp< PacketXf >(const PacketXf &a, PacketXf &exponent)
Definition: SVE/PacketMath.h:589
EIGEN_STRONG_INLINE PacketXi pcmp_le< PacketXi >(const PacketXi &a, const PacketXi &b)
Definition: SVE/PacketMath.h:138
EIGEN_STRONG_INLINE PacketXi ploadu< PacketXi >(const numext::int32_t *from)
Definition: SVE/PacketMath.h:203
EIGEN_STRONG_INLINE numext::int32_t predux_min< PacketXi >(const PacketXi &a)
Definition: SVE/PacketMath.h:302
EIGEN_STRONG_INLINE void pstore< float >(float *to, const Packet4f &from)
Definition: AltiVec/PacketMath.h:642
EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f &a)
Definition: AltiVec/PacketMath.h:1936
EIGEN_STRONG_INLINE float predux_max< PacketXf >(const PacketXf &a)
Definition: SVE/PacketMath.h:639
EIGEN_STRONG_INLINE PacketXi pcmp_lt< PacketXi >(const PacketXi &a, const PacketXi &b)
Definition: SVE/PacketMath.h:143
EIGEN_STRONG_INLINE PacketXi pmul< PacketXi >(const PacketXi &a, const PacketXi &b)
Definition: SVE/PacketMath.h:113
EIGEN_DEVICE_FUNC void pscatter< float, PacketXf >(float *to, const PacketXf &from, Index stride)
Definition: SVE/PacketMath.h:564
EIGEN_STRONG_INLINE PacketXf pload< PacketXf >(const float *from)
Definition: SVE/PacketMath.h:522
EIGEN_STRONG_INLINE PacketXf pand< PacketXf >(const PacketXf &a, const PacketXf &b)
Definition: SVE/PacketMath.h:502
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pldexp_generic(const Packet &a, const Packet &exponent)
Definition: GenericPacketMathFunctions.h:226
EIGEN_STRONG_INLINE PacketXi pmin< PacketXi >(const PacketXi &a, const PacketXi &b)
Definition: SVE/PacketMath.h:128
EIGEN_STRONG_INLINE PacketXf ploadu< PacketXf >(const float *from)
Definition: SVE/PacketMath.h:527
EIGEN_STRONG_INLINE numext::int32_t predux_mul< PacketXi >(const PacketXi &a)
Definition: SVE/PacketMath.h:269
EIGEN_STRONG_INLINE PacketXi pand< PacketXi >(const PacketXi &a, const PacketXi &b)
Definition: SVE/PacketMath.h:163
EIGEN_STRONG_INLINE PacketXf ploadquad< PacketXf >(const float *from)
Definition: SVE/PacketMath.h:539
EIGEN_DEVICE_FUNC PacketXf pgather< float, PacketXf >(const float *from, Index stride)
Definition: SVE/PacketMath.h:557
EIGEN_STRONG_INLINE numext::int32_t pfirst< PacketXi >(const PacketXi &a)
Definition: SVE/PacketMath.h:248
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pfrexp_generic(const Packet &a, Packet &exponent)
Definition: GenericPacketMathFunctions.h:184
EIGEN_STRONG_INLINE PacketXi pzero< PacketXi >(const PacketXi &)
Definition: SVE/PacketMath.h:158
EIGEN_STRONG_INLINE PacketXf pmax< PacketXf >(const PacketXf &a, const PacketXf &b)
Definition: SVE/PacketMath.h:451
svint32_t PacketXi __attribute__((arm_sve_vector_bits(EIGEN_ARM64_SVE_VL)))
Definition: SVE/PacketMath.h:34
EIGEN_STRONG_INLINE PacketXf pmax< PropagateNumbers, PacketXf >(const PacketXf &a, const PacketXf &b)
Definition: SVE/PacketMath.h:461
EIGEN_STRONG_INLINE PacketXi pandnot< PacketXi >(const PacketXi &a, const PacketXi &b)
Definition: SVE/PacketMath.h:178
EIGEN_STRONG_INLINE PacketXi padd< PacketXi >(const PacketXi &a, const PacketXi &b)
Definition: SVE/PacketMath.h:93
EIGEN_STRONG_INLINE PacketXf psqrt< PacketXf >(const PacketXf &a)
Definition: SVE/PacketMath.h:665
EIGEN_STRONG_INLINE PacketXf pdiv< PacketXf >(const PacketXf &a, const PacketXf &b)
Definition: SVE/PacketMath.h:426
EIGEN_STRONG_INLINE PacketXf pset1< PacketXf >(const float &from)
Definition: SVE/PacketMath.h:384
EIGEN_STRONG_INLINE void pstoreu< float >(float *to, const Packet4f &from)
Definition: AltiVec/PacketMath.h:1756
EIGEN_STRONG_INLINE PacketXf ploaddup< PacketXf >(const float *from)
Definition: SVE/PacketMath.h:532
std::int32_t int32_t
Definition: Meta.h:41
std::uint32_t uint32_t
Definition: Meta.h:40
Namespace containing all symbols from the Eigen library.
Definition: bench_norm.cpp:70
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition: Meta.h:83
const Product< Lhs, Rhs > prod(const Lhs &lhs, const Rhs &rhs)
Definition: evaluators.cpp:7
int c
Definition: calibrate.py:100
Definition: Eigen_Colamd.h:49
Definition: GenericPacketMath.h:1407
Packet packet[N]
Definition: GenericPacketMath.h:1408
Definition: GenericPacketMath.h:45
@ HasSin
Definition: GenericPacketMath.h:81
@ HasBlend
Definition: GenericPacketMath.h:66
@ HasErfc
Definition: GenericPacketMath.h:96
@ HasArg
Definition: GenericPacketMath.h:64
@ HasCos
Definition: GenericPacketMath.h:82
@ HasShift
Definition: GenericPacketMath.h:50
@ HasExp
Definition: GenericPacketMath.h:75
@ HasSqrt
Definition: GenericPacketMath.h:73
@ HasErf
Definition: GenericPacketMath.h:95
@ HasLog
Definition: GenericPacketMath.h:77
@ HasTanh
Definition: GenericPacketMath.h:90
@ HasDiv
Definition: GenericPacketMath.h:71
PacketXf type
Definition: SVE/PacketMath.h:332
PacketXf half
Definition: SVE/PacketMath.h:333
PacketXi half
Definition: SVE/PacketMath.h:39
PacketXi type
Definition: SVE/PacketMath.h:38
Definition: GenericPacketMath.h:108
@ size
Definition: GenericPacketMath.h:113
@ AlignedOnScalar
Definition: GenericPacketMath.h:114
@ Vectorizable
Definition: GenericPacketMath.h:112
@ HasSub
Definition: GenericPacketMath.h:118
@ HasMax
Definition: GenericPacketMath.h:124
@ HasNegate
Definition: GenericPacketMath.h:120
@ HasMul
Definition: GenericPacketMath.h:119
@ HasAdd
Definition: GenericPacketMath.h:117
@ HasSetLinear
Definition: GenericPacketMath.h:126
@ HasMin
Definition: GenericPacketMath.h:123
@ HasConj
Definition: GenericPacketMath.h:125
@ HasAbs2
Definition: GenericPacketMath.h:122
@ HasAbs
Definition: GenericPacketMath.h:121
Definition: SVE/PacketMath.h:29
@ size
Definition: SVE/PacketMath.h:30
PacketXi integer_packet
Definition: SVE/PacketMath.h:372
float type
Definition: SVE/PacketMath.h:370
PacketXf half
Definition: SVE/PacketMath.h:371
numext::int32_t type
Definition: SVE/PacketMath.h:64
PacketXi half
Definition: SVE/PacketMath.h:65
Definition: GenericPacketMath.h:134
@ masked_load_available
Definition: GenericPacketMath.h:142
@ size
Definition: GenericPacketMath.h:139
@ masked_store_available
Definition: GenericPacketMath.h:143
@ vectorizable
Definition: GenericPacketMath.h:141
@ alignment
Definition: GenericPacketMath.h:140