TensorRandom.h
Go to the documentation of this file.
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
5 // Copyright (C) 2018 Mehdi Goli <eigen@codeplay.com> Codeplay Software Ltd.
6 //
7 // This Source Code Form is subject to the terms of the Mozilla
8 // Public License v. 2.0. If a copy of the MPL was not distributed
9 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
10 
11 #ifndef EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
12 #define EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
13 
14 // IWYU pragma: private
15 #include "./InternalHeaderCheck.h"
16 
17 namespace Eigen {
18 namespace internal {
19 
21 #if defined(EIGEN_GPU_COMPILE_PHASE)
22  // We don't support 3d kernels since we currently only use 1 and
23  // 2d kernels.
24  gpu_assert(threadIdx.z == 0);
25  return blockIdx.x * blockDim.x + threadIdx.x + gridDim.x * blockDim.x * (blockIdx.y * blockDim.y + threadIdx.y);
26 #else
27  // Rely on Eigen's random implementation.
28  return random<uint64_t>();
29 #endif
30 }
31 
33  // TODO: Unify with the implementation in the non blocking thread pool.
34  uint64_t current = *state;
35  // Update the internal state
36  *state = current * 6364136223846793005ULL + (stream << 1 | 1);
37  // Generate the random output (using the PCG-XSH-RS scheme)
38  return static_cast<unsigned>((current ^ (current >> 22)) >> (22 + (current >> 61)));
39 }
40 
42  seed = seed ? seed : get_random_seed();
43  return seed * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
44 }
45 
46 template <typename T>
48  unsigned rnd = PCG_XSH_RS_generator(state, stream);
49  return static_cast<T>(rnd);
50 }
51 
52 template <>
53 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half RandomToTypeUniform<Eigen::half>(uint64_t* state, uint64_t stream) {
54  // Generate 10 random bits for the mantissa, merge with exponent.
55  unsigned rnd = PCG_XSH_RS_generator(state, stream);
56  const uint16_t half_bits = static_cast<uint16_t>(rnd & 0x3ffu) | (static_cast<uint16_t>(15) << 10);
57  Eigen::half result = Eigen::numext::bit_cast<Eigen::half>(half_bits);
58  // Return the final result
59  return result - Eigen::half(1.0f);
60 }
61 
62 template <>
63 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::bfloat16 RandomToTypeUniform<Eigen::bfloat16>(uint64_t* state,
64  uint64_t stream) {
65  // Generate 7 random bits for the mantissa, merge with exponent.
66  unsigned rnd = PCG_XSH_RS_generator(state, stream);
67  const uint16_t half_bits = static_cast<uint16_t>(rnd & 0x7fu) | (static_cast<uint16_t>(127) << 7);
68  Eigen::bfloat16 result = Eigen::numext::bit_cast<Eigen::bfloat16>(half_bits);
69  // Return the final result
70  return result - Eigen::bfloat16(1.0f);
71 }
72 
73 template <>
75  typedef union {
76  uint32_t raw;
77  float fp;
78  } internal;
79  internal result;
80  // Generate 23 random bits for the mantissa mantissa
81  const unsigned rnd = PCG_XSH_RS_generator(state, stream);
82  result.raw = rnd & 0x7fffffu;
83  // Set the exponent
84  result.raw |= (static_cast<uint32_t>(127) << 23);
85  // Return the final result
86  return result.fp - 1.0f;
87 }
88 
89 template <>
91  typedef union {
92  uint64_t raw;
93  double dp;
94  } internal;
95  internal result;
96  result.raw = 0;
97  // Generate 52 random bits for the mantissa
98  // First generate the upper 20 bits
99  unsigned rnd1 = PCG_XSH_RS_generator(state, stream) & 0xfffffu;
100  // The generate the lower 32 bits
101  unsigned rnd2 = PCG_XSH_RS_generator(state, stream);
102  result.raw = (static_cast<uint64_t>(rnd1) << 32) | rnd2;
103  // Set the exponent
104  result.raw |= (static_cast<uint64_t>(1023) << 52);
105  // Return the final result
106  return result.dp - 1.0;
107 }
108 
109 template <>
110 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<float> RandomToTypeUniform<std::complex<float> >(uint64_t* state,
111  uint64_t stream) {
112  return std::complex<float>(RandomToTypeUniform<float>(state, stream), RandomToTypeUniform<float>(state, stream));
113 }
114 template <>
115 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<double> RandomToTypeUniform<std::complex<double> >(uint64_t* state,
116  uint64_t stream) {
117  return std::complex<double>(RandomToTypeUniform<double>(state, stream), RandomToTypeUniform<double>(state, stream));
118 }
119 
120 template <typename T>
122  public:
123  static constexpr bool PacketAccess = true;
124 
125  // Uses the given "seed" if non-zero, otherwise uses a random seed.
127  m_state = PCG_XSH_RS_state(seed);
128 #ifdef EIGEN_USE_SYCL
129  // In SYCL it is not possible to build PCG_XSH_RS_state in one step.
130  // Therefore, we need two steps to initializate the m_state.
131  // IN SYCL, the constructor of the functor is s called on the CPU
132  // and we get the clock seed here from the CPU. However, This seed is
133  // the same for all the thread. As unlike CUDA, the thread.ID, BlockID, etc is not a global function.
134  // and only available on the Operator() function (which is called on the GPU).
135  // Thus for CUDA (((CLOCK + global_thread_id)* 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each
136  // thread but for SYCL ((CLOCK * 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread and each
137  // thread adds the (global_thread_id* 6364136223846793005ULL) for itself only once, in order to complete the
138  // construction similar to CUDA Therefore, the thread Id injection is not available at this stage.
139  // However when the operator() is called the thread ID will be available. So inside the operator,
140  // we add the thrreadID, BlockId,... (which is equivalent of i)
141  // to the seed and construct the unique m_state per thead similar to cuda.
142  m_exec_once = false;
143 #endif
144  }
146  m_state = other.m_state;
147 #ifdef EIGEN_USE_SYCL
148  m_exec_once = other.m_exec_once;
149 #endif
150  }
151 
152  template <typename Index>
154 #ifdef EIGEN_USE_SYCL
155  if (!m_exec_once) {
156  // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
157  // The (i * 6364136223846793005ULL) is the remaining part of the PCG_XSH_RS_state on the GPU side
158  m_state += (i * 6364136223846793005ULL);
159  m_exec_once = true;
160  }
161 #endif
162  T result = RandomToTypeUniform<T>(&m_state, i);
163  return result;
164  }
165 
166  template <typename Packet, typename Index>
168  const int packetSize = internal::unpacket_traits<Packet>::size;
169  EIGEN_ALIGN_MAX T values[packetSize];
170 #ifdef EIGEN_USE_SYCL
171  if (!m_exec_once) {
172  // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
173  m_state += (i * 6364136223846793005ULL);
174  m_exec_once = true;
175  }
176 #endif
178  for (int j = 0; j < packetSize; ++j) {
179  values[j] = RandomToTypeUniform<T>(&m_state, i);
180  }
181  return internal::pload<Packet>(values);
182  }
183 
184  private:
185  mutable uint64_t m_state;
186 #ifdef EIGEN_USE_SYCL
187  mutable bool m_exec_once;
188 #endif
189 };
190 
191 template <typename Scalar>
193  enum {
194  // Rough estimate for floating point, multiplied by ceil(sizeof(T) / sizeof(float)).
195  Cost = 12 * NumTraits<Scalar>::AddCost * ((sizeof(Scalar) + sizeof(float) - 1) / sizeof(float)),
197  };
198 };
199 
200 template <typename T>
202  // Use the ratio of uniform method to generate numbers following a normal
203  // distribution. See for example Numerical Recipes chapter 7.3.9 for the
204  // details.
205  T u, v, q;
206  do {
207  u = RandomToTypeUniform<T>(state, stream);
208  v = T(1.7156) * (RandomToTypeUniform<T>(state, stream) - T(0.5));
209  const T x = u - T(0.449871);
210  const T y = numext::abs(v) + T(0.386595);
211  q = x * x + y * (T(0.196) * y - T(0.25472) * x);
212  } while (q > T(0.27597) && (q > T(0.27846) || v * v > T(-4) * numext::log(u) * u * u));
213 
214  return v / u;
215 }
216 
217 template <>
218 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<float> RandomToTypeNormal<std::complex<float> >(uint64_t* state,
219  uint64_t stream) {
220  return std::complex<float>(RandomToTypeNormal<float>(state, stream), RandomToTypeNormal<float>(state, stream));
221 }
222 template <>
223 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<double> RandomToTypeNormal<std::complex<double> >(uint64_t* state,
224  uint64_t stream) {
225  return std::complex<double>(RandomToTypeNormal<double>(state, stream), RandomToTypeNormal<double>(state, stream));
226 }
227 
228 template <typename T>
230  public:
231  static constexpr bool PacketAccess = true;
232 
233  // Uses the given "seed" if non-zero, otherwise uses a random seed.
235  m_state = PCG_XSH_RS_state(seed);
236 #ifdef EIGEN_USE_SYCL
237  // In SYCL it is not possible to build PCG_XSH_RS_state in one step.
238  // Therefore, we need two steps to initializate the m_state.
239  // IN SYCL, the constructor of the functor is s called on the CPU
240  // and we get the clock seed here from the CPU. However, This seed is
241  // the same for all the thread. As unlike CUDA, the thread.ID, BlockID, etc is not a global function.
242  // and only available on the Operator() function (which is called on the GPU).
243  // Therefore, the thread Id injection is not available at this stage. However when the operator()
244  // is called the thread ID will be available. So inside the operator,
245  // we add the thrreadID, BlockId,... (which is equivalent of i)
246  // to the seed and construct the unique m_state per thead similar to cuda.
247  m_exec_once = false;
248 #endif
249  }
251  m_state = other.m_state;
252 #ifdef EIGEN_USE_SYCL
253  m_exec_once = other.m_exec_once;
254 #endif
255  }
256 
257  template <typename Index>
259 #ifdef EIGEN_USE_SYCL
260  if (!m_exec_once) {
261  // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
262  m_state += (i * 6364136223846793005ULL);
263  m_exec_once = true;
264  }
265 #endif
266  T result = RandomToTypeNormal<T>(&m_state, i);
267  return result;
268  }
269 
270  template <typename Packet, typename Index>
272  const int packetSize = internal::unpacket_traits<Packet>::size;
273  EIGEN_ALIGN_MAX T values[packetSize];
274 #ifdef EIGEN_USE_SYCL
275  if (!m_exec_once) {
276  // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
277  m_state += (i * 6364136223846793005ULL);
278  m_exec_once = true;
279  }
280 #endif
282  for (int j = 0; j < packetSize; ++j) {
283  values[j] = RandomToTypeNormal<T>(&m_state, i);
284  }
285  return internal::pload<Packet>(values);
286  }
287 
288  private:
289  mutable uint64_t m_state;
290 #ifdef EIGEN_USE_SYCL
291  mutable bool m_exec_once;
292 #endif
293 };
294 
295 template <typename Scalar>
297  enum {
298  // On average, we need to generate about 3 random numbers
299  // 15 mul, 8 add, 1.5 logs
303  };
304 };
305 
306 } // end namespace internal
307 } // end namespace Eigen
308 
309 #endif // EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
Array< int, Dynamic, 1 > v
Definition: Array_initializer_list_vector_cxx11.cpp:1
int i
Definition: BiCGSTAB_step_by_step.cpp:9
#define EIGEN_ALIGN_MAX
Definition: ConfigureVectorization.h:146
Eigen::Triplet< double > T
Definition: EigenUnitTest.cpp:11
#define EIGEN_UNROLL_LOOP
Definition: Macros.h:1298
#define EIGEN_DEVICE_FUNC
Definition: Macros.h:892
#define EIGEN_STRONG_INLINE
Definition: Macros.h:834
SCALAR Scalar
Definition: bench_gemm.cpp:45
Definition: TensorRandom.h:229
uint64_t m_state
Definition: TensorRandom.h:289
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(uint64_t seed=0)
Definition: TensorRandom.h:234
static constexpr bool PacketAccess
Definition: TensorRandom.h:231
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(Index i) const
Definition: TensorRandom.h:271
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(const NormalRandomGenerator &other)
Definition: TensorRandom.h:250
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(Index i) const
Definition: TensorRandom.h:258
Definition: TensorRandom.h:121
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(Index i) const
Definition: TensorRandom.h:153
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(Index i) const
Definition: TensorRandom.h:167
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(const UniformRandomGenerator &other)
Definition: TensorRandom.h:145
static constexpr bool PacketAccess
Definition: TensorRandom.h:123
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(uint64_t seed=0)
Definition: TensorRandom.h:126
uint64_t m_state
Definition: TensorRandom.h:185
dim3 threadIdx
Definition: gpu_common.h:16
dim3 blockDim
Definition: gpu_common.h:16
dim3 blockIdx
Definition: gpu_common.h:16
const Scalar & y
Definition: RandomImpl.h:36
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t get_random_seed()
Definition: TensorRandom.h:20
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T RandomToTypeNormal(uint64_t *state, uint64_t stream)
Definition: TensorRandom.h:201
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float RandomToTypeUniform< float >(uint64_t *state, uint64_t stream)
Definition: TensorRandom.h:74
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T RandomToTypeUniform(uint64_t *state, uint64_t stream)
Definition: TensorRandom.h:47
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double RandomToTypeUniform< double >(uint64_t *state, uint64_t stream)
Definition: TensorRandom.h:90
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t PCG_XSH_RS_state(uint64_t seed)
Definition: TensorRandom.h:41
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t *state, uint64_t stream)
Definition: TensorRandom.h:32
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T log(const T &x)
Definition: MathFunctions.h:1332
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::enable_if_t< NumTraits< T >::IsSigned||NumTraits< T >::IsComplex, typename NumTraits< T >::Real > abs(const T &x)
Definition: MathFunctions.h:1355
EIGEN_DEVICE_FUNC const Scalar & q
Definition: SpecialFunctionsImpl.h:2019
std::uint16_t uint16_t
Definition: Meta.h:38
std::uint32_t uint32_t
Definition: Meta.h:40
std::uint64_t uint64_t
Definition: Meta.h:42
Namespace containing all symbols from the Eigen library.
Definition: bench_norm.cpp:70
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition: Meta.h:83
Definition: Eigen_Colamd.h:49
list x
Definition: plotDoE.py:28
Holds information about the various numeric (i.e. scalar) types allowed by Eigen.
Definition: NumTraits.h:217
Definition: BFloat16.h:101
Definition: Half.h:139
Definition: XprHelper.h:205
@ PacketAccess
Definition: XprHelper.h:206
@ Cost
Definition: XprHelper.h:206
Definition: GenericPacketMath.h:134
std::ptrdiff_t j
Definition: tut_arithmetic_redux_minmax.cpp:2
Definition: ZVector/PacketMath.h:50