arch/NEON/GeneralBlockPanelKernel.h
Go to the documentation of this file.
1 // IWYU pragma: private
2 #include "../../InternalHeaderCheck.h"
3 
4 namespace Eigen {
5 namespace internal {
6 
7 #if EIGEN_ARCH_ARM && EIGEN_COMP_CLANG
8 
9 // Clang seems to excessively spill registers in the GEBP kernel on 32-bit arm.
10 // Here we specialize gebp_traits to eliminate these register spills.
11 // See #2138.
12 template <>
13 struct gebp_traits<float, float, false, false, Architecture::NEON, GEBPPacketFull>
14  : gebp_traits<float, float, false, false, Architecture::Generic, GEBPPacketFull> {
15  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const {
16  // This volatile inline ASM both acts as a barrier to prevent reordering,
17  // as well as enforces strict register use.
18  asm volatile("vmla.f32 %q[r], %q[c], %q[alpha]" : [r] "+w"(r) : [c] "w"(c), [alpha] "w"(alpha) :);
19  }
20 
21  template <typename LaneIdType>
22  EIGEN_STRONG_INLINE void madd(const Packet4f& a, const Packet4f& b, Packet4f& c, Packet4f&, const LaneIdType&) const {
23  acc(a, b, c);
24  }
25 
26  template <typename LaneIdType>
27  EIGEN_STRONG_INLINE void madd(const Packet4f& a, const QuadPacket<Packet4f>& b, Packet4f& c, Packet4f& tmp,
28  const LaneIdType& lane) const {
29  madd(a, b.get(lane), c, tmp, lane);
30  }
31 };
32 
33 #endif // EIGEN_ARCH_ARM && EIGEN_COMP_CLANG
34 
35 #if EIGEN_ARCH_ARM64
36 
// Default for the `nr` enumerator of the ARM64 gebp_traits specializations in
// this file. A definition of EIGEN_NEON_GEBP_NR made before this point wins.
#if !defined(EIGEN_NEON_GEBP_NR)
#define EIGEN_NEON_GEBP_NR 8
#endif
40 
41 template <>
42 struct gebp_traits<float, float, false, false, Architecture::NEON, GEBPPacketFull>
43  : gebp_traits<float, float, false, false, Architecture::Generic, GEBPPacketFull> {
44  typedef float RhsPacket;
45  typedef float32x4_t RhsPacketx4;
46  enum { nr = EIGEN_NEON_GEBP_NR };
47  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { dest = *b; }
48 
49  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const { dest = vld1q_f32(b); }
50 
51  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const { dest = *b; }
52 
53  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {}
54 
55  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { loadRhs(b, dest); }
56 
57  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/,
58  const FixedInt<0>&) const {
59  c = vfmaq_n_f32(c, a, b);
60  }
61  // NOTE: Template parameter inference failed when compiled with Android NDK:
62  // "candidate template ignored: could not match 'FixedInt<N>' against 'Eigen::internal::FixedInt<0>".
63 
64  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/,
65  const FixedInt<0>&) const {
66  madd_helper<0>(a, b, c);
67  }
68  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/,
69  const FixedInt<1>&) const {
70  madd_helper<1>(a, b, c);
71  }
72  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/,
73  const FixedInt<2>&) const {
74  madd_helper<2>(a, b, c);
75  }
76  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/,
77  const FixedInt<3>&) const {
78  madd_helper<3>(a, b, c);
79  }
80 
81  private:
82  template <int LaneID>
83  EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const {
84 #if EIGEN_GNUC_STRICT_LESS_THAN(9, 0, 0)
85  // 1. workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101
86  // vfmaq_laneq_f32 is implemented through a costly dup, which was fixed in gcc9
87  // 2. workaround the gcc register split problem on arm64-neon
88  if (LaneID == 0)
89  asm("fmla %0.4s, %1.4s, %2.s[0]\n" : "+w"(c) : "w"(a), "w"(b) :);
90  else if (LaneID == 1)
91  asm("fmla %0.4s, %1.4s, %2.s[1]\n" : "+w"(c) : "w"(a), "w"(b) :);
92  else if (LaneID == 2)
93  asm("fmla %0.4s, %1.4s, %2.s[2]\n" : "+w"(c) : "w"(a), "w"(b) :);
94  else if (LaneID == 3)
95  asm("fmla %0.4s, %1.4s, %2.s[3]\n" : "+w"(c) : "w"(a), "w"(b) :);
96 #else
97  c = vfmaq_laneq_f32(c, a, b, LaneID);
98 #endif
99  }
100 };
101 
102 template <>
103 struct gebp_traits<double, double, false, false, Architecture::NEON>
104  : gebp_traits<double, double, false, false, Architecture::Generic> {
105  typedef double RhsPacket;
106  enum { nr = EIGEN_NEON_GEBP_NR };
107  struct RhsPacketx4 {
108  float64x2_t B_0, B_1;
109  };
110 
111  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { dest = *b; }
112 
113  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const {
114  dest.B_0 = vld1q_f64(b);
115  dest.B_1 = vld1q_f64(b + 2);
116  }
117 
118  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const { loadRhs(b, dest); }
119 
120  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {}
121 
122  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { loadRhs(b, dest); }
123 
124  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/,
125  const FixedInt<0>&) const {
126  c = vfmaq_n_f64(c, a, b);
127  }
128 
129  // NOTE: Template parameter inference failed when compiled with Android NDK:
130  // "candidate template ignored: could not match 'FixedInt<N>' against 'Eigen::internal::FixedInt<0>".
131 
132  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/,
133  const FixedInt<0>&) const {
134  madd_helper<0>(a, b, c);
135  }
136  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/,
137  const FixedInt<1>&) const {
138  madd_helper<1>(a, b, c);
139  }
140  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/,
141  const FixedInt<2>&) const {
142  madd_helper<2>(a, b, c);
143  }
144  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/,
145  const FixedInt<3>&) const {
146  madd_helper<3>(a, b, c);
147  }
148 
149  private:
150  template <int LaneID>
151  EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const {
152 #if EIGEN_GNUC_STRICT_LESS_THAN(9, 0, 0)
153  // 1. workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101
154  // vfmaq_laneq_f64 is implemented through a costly dup, which was fixed in gcc9
155  // 2. workaround the gcc register split problem on arm64-neon
156  if (LaneID == 0)
157  asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w"(c) : "w"(a), "w"(b.B_0) :);
158  else if (LaneID == 1)
159  asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w"(c) : "w"(a), "w"(b.B_0) :);
160  else if (LaneID == 2)
161  asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w"(c) : "w"(a), "w"(b.B_1) :);
162  else if (LaneID == 3)
163  asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w"(c) : "w"(a), "w"(b.B_1) :);
164 #else
165  if (LaneID == 0)
166  c = vfmaq_laneq_f64(c, a, b.B_0, 0);
167  else if (LaneID == 1)
168  c = vfmaq_laneq_f64(c, a, b.B_0, 1);
169  else if (LaneID == 2)
170  c = vfmaq_laneq_f64(c, a, b.B_1, 0);
171  else if (LaneID == 3)
172  c = vfmaq_laneq_f64(c, a, b.B_1, 1);
173 #endif
174  }
175 };
176 
177 // The register at operand 3 of fmla for data type half must be v0~v15, the compiler may not
178 // allocate a required register for the '%2' of inline asm 'fmla %0.8h, %1.8h, %2.h[id]',
179 // so inline assembly can't be used here to avoid the bug that vfmaq_lane_f16 is implemented
180 // through a costly dup in gcc compiler.
181 #if EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC && EIGEN_COMP_CLANG
182 
183 template <>
184 struct gebp_traits<half, half, false, false, Architecture::NEON>
185  : gebp_traits<half, half, false, false, Architecture::Generic> {
186  typedef half RhsPacket;
187  typedef float16x4_t RhsPacketx4;
188  typedef float16x4_t PacketHalf;
189  enum { nr = EIGEN_NEON_GEBP_NR };
190 
191  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { dest = *b; }
192 
193  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const { dest = vld1_f16((const __fp16*)b); }
194 
195  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const { dest = *b; }
196 
197  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {}
198 
199  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar*, RhsPacket&) const {
200  // If LHS is a Packet8h, we cannot correctly mimic a ploadquad of the RHS
201  // using a single scalar value.
202  eigen_assert(false && "Cannot loadRhsQuad for a scalar RHS.");
203  }
204 
205  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/,
206  const FixedInt<0>&) const {
207  c = vfmaq_n_f16(c, a, b);
208  }
209  EIGEN_STRONG_INLINE void madd(const PacketHalf& a, const RhsPacket& b, PacketHalf& c, RhsPacket& /*tmp*/,
210  const FixedInt<0>&) const {
211  c = vfma_n_f16(c, a, b);
212  }
213 
214  // NOTE: Template parameter inference failed when compiled with Android NDK:
215  // "candidate template ignored: could not match 'FixedInt<N>' against 'Eigen::internal::FixedInt<0>".
216  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/,
217  const FixedInt<0>&) const {
218  madd_helper<0>(a, b, c);
219  }
220  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/,
221  const FixedInt<1>&) const {
222  madd_helper<1>(a, b, c);
223  }
224  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/,
225  const FixedInt<2>&) const {
226  madd_helper<2>(a, b, c);
227  }
228  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/,
229  const FixedInt<3>&) const {
230  madd_helper<3>(a, b, c);
231  }
232 
233  private:
234  template <int LaneID>
235  EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const {
236  c = vfmaq_lane_f16(c, a, b, LaneID);
237  }
238 };
239 #endif // EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC && EIGEN_COMP_CLANG
240 #endif // EIGEN_ARCH_ARM64
241 
242 } // namespace internal
243 } // namespace Eigen
#define eigen_assert(x)
Definition: Macros.h:910
#define EIGEN_STRONG_INLINE
Definition: Macros.h:834
Scalar * b
Definition: benchVecAdd.cpp:17
RhsScalar_ RhsScalar
Definition: products/GeneralBlockPanelKernel.h:400
EIGEN_STRONG_INLINE void acc(const AccPacket &c, const ResPacket &alpha, ResPacket &r) const
Definition: products/GeneralBlockPanelKernel.h:499
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar *b, RhsPacketType &dest) const
Definition: products/GeneralBlockPanelKernel.h:448
EIGEN_STRONG_INLINE void updateRhs(const RhsScalar *b, RhsPacketType &dest) const
Definition: products/GeneralBlockPanelKernel.h:457
QuadPacket< RhsPacket > RhsPacketx4
Definition: products/GeneralBlockPanelKernel.h:442
ResPacket AccPacket
Definition: products/GeneralBlockPanelKernel.h:443
std::conditional_t< Vectorizable, ResPacket_, ResScalar > ResPacket
Definition: products/GeneralBlockPanelKernel.h:439
std::conditional_t< Vectorizable, RhsPacket_, RhsScalar > RhsPacket
Definition: products/GeneralBlockPanelKernel.h:438
std::conditional_t< Vectorizable, LhsPacket_, LhsScalar > LhsPacket
Definition: products/GeneralBlockPanelKernel.h:437
EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar *b, RhsPacket &dest) const
Definition: products/GeneralBlockPanelKernel.h:463
@ nr
Definition: products/GeneralBlockPanelKernel.h:418
EIGEN_STRONG_INLINE void madd(const LhsPacketType &a, const RhsPacketType &b, AccPacketType &c, RhsPacketType &tmp, const LaneIdType &) const
Definition: products/GeneralBlockPanelKernel.h:476
RealScalar alpha
Definition: level1_cplx_impl.h:151
const Scalar * a
Definition: level2_cplx_impl.h:32
Eigen::Matrix< Scalar, Dynamic, Dynamic, ColMajor > tmp
Definition: level3_impl.h:365
@ NEON
Definition: Constants.h:473
@ GEBPPacketFull
Definition: products/GeneralBlockPanelKernel.h:20
__vector float Packet4f
Definition: AltiVec/PacketMath.h:33
Namespace containing all symbols from the Eigen library.
Definition: bench_norm.cpp:70
r
Definition: UniformPSDSelfTest.py:20
int c
Definition: calibrate.py:100
Definition: Eigen_Colamd.h:49
Packet B_0
Definition: products/GeneralBlockPanelKernel.h:344