arch/SSE/MathFunctions.h
Go to the documentation of this file.
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2007 Julien Pommier
5 // Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
6 //
7 // This Source Code Form is subject to the terms of the Mozilla
8 // Public License v. 2.0. If a copy of the MPL was not distributed
9 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
10 
11 /* The sin and cos functions of this file come from
12  * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
13  */
14 
15 #ifndef EIGEN_MATH_FUNCTIONS_SSE_H
16 #define EIGEN_MATH_FUNCTIONS_SSE_H
17 
18 // IWYU pragma: private
19 #include "../../InternalHeaderCheck.h"
20 
21 namespace Eigen {
22 
23 namespace internal {
24 
27 
28 // Notice that for newer processors, it is counterproductive to use Newton
29 // iteration for square root. In particular, Skylake and Zen2 processors
30 // have approximately doubled throughput of the _mm_sqrt_ps instruction
31 // compared to their predecessors.
32 template <>
34  return _mm_sqrt_ps(x);
35 }
36 template <>
38  return _mm_sqrt_pd(x);
39 }
40 template <>
42  return x;
43 }
44 
45 #if EIGEN_FAST_MATH
46 // Even on Skylake, using Newton iteration is a win for reciprocal square root.
47 template <>
49  return generic_rsqrt_newton_step<Packet4f, /*Steps=*/1>::run(x, _mm_rsqrt_ps(x));
50 }
51 
52 #ifdef EIGEN_VECTORIZE_FMA
53 // Trying to speed up reciprocal using Newton-Raphson is counterproductive
54 // unless FMA is available. Without FMA, pdiv(pset1<Packet>(Scalar(1)), a) is
55 // 30% faster.
56 template <>
58  return generic_reciprocal_newton_step<Packet4f, /*Steps=*/1>::run(x, _mm_rcp_ps(x));
59 }
60 #endif
61 
62 #endif
63 
64 } // end namespace internal
65 
66 namespace numext {
67 
68 template <>
70  return internal::pfirst(internal::Packet4f(_mm_sqrt_ss(_mm_set_ss(x))));
71 }
72 
73 template <>
75 #if EIGEN_COMP_GNUC_STRICT
76  // This works around a GCC bug generating poor code for _mm_sqrt_pd
77  // See https://gitlab.com/libeigen/eigen/commit/8dca9f97e38970
78  return internal::pfirst(internal::Packet2d(__builtin_ia32_sqrtsd(_mm_set_sd(x))));
79 #else
80  return internal::pfirst(internal::Packet2d(_mm_sqrt_pd(_mm_set_sd(x))));
81 #endif
82 }
83 
84 } // namespace numext
85 
86 } // end namespace Eigen
87 
88 #endif // EIGEN_MATH_FUNCTIONS_SSE_H
#define EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_DOUBLE(PACKET)
Definition: GenericPacketMathFunctionsFwd.h:203
#define EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_FLOAT(PACKET)
Definition: GenericPacketMathFunctionsFwd.h:188
#define EIGEN_ALWAYS_INLINE
Definition: Macros.h:845
#define EIGEN_UNUSED
Definition: Macros.h:940
#define EIGEN_DEVICE_FUNC
Definition: Macros.h:892
#define EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
Definition: Macros.h:900
#define EIGEN_STRONG_INLINE
Definition: Macros.h:834
__m128d Packet2d
Definition: LSX/PacketMath.h:36
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f psqrt< Packet4f >(const Packet4f &x)
Definition: arch/AltiVec/MathFunctions.h:68
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet16b psqrt< Packet16b >(const Packet16b &x)
Definition: arch/SSE/MathFunctions.h:41
EIGEN_STRONG_INLINE bfloat16 pfirst(const Packet8bf &a)
Definition: AltiVec/PacketMath.h:2418
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet2d psqrt< Packet2d >(const Packet2d &x)
Definition: arch/SSE/MathFunctions.h:37
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f prsqrt< Packet4f >(const Packet4f &x)
Definition: arch/ZVector/MathFunctions.h:209
__vector float Packet4f
Definition: AltiVec/PacketMath.h:33
EIGEN_STRONG_INLINE Packet4f preciprocal< Packet4f >(const Packet4f &a)
Definition: LSX/PacketMath.h:2719
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float sqrt(const float &x)
Definition: arch/SSE/MathFunctions.h:69
EIGEN_DEVICE_FUNC const Scalar & x
Definition: SpecialFunctionsImpl.h:2024
Namespace containing all symbols from the Eigen library.
Definition: bench_norm.cpp:70
auto run(Kernel kernel, Args &&... args) -> decltype(kernel(args...))
Definition: gpu_test_helper.h:414
Definition: Eigen_Colamd.h:49
list x
Definition: plotDoE.py:28
Definition: GenericPacketMath.h:225