TensorReductionGpu.h
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
5 //
6 // This Source Code Form is subject to the terms of the Mozilla
7 // Public License v. 2.0. If a copy of the MPL was not distributed
8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 
10 #ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H
11 #define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H
12 
13 // IWYU pragma: private
14 #include "./InternalHeaderCheck.h"
15 
16 namespace Eigen {
17 namespace internal {
18 
19 #if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
20 // Full reducers for GPU, don't vectorize for now
21 
22 // Reducer function that enables multiple GPU threads to safely accumulate at the same
23 // output address. It reads the current value of the output variable and attempts to
24 // update it with the new value. If another GPU thread updated the content of the
25 // output address in the meantime, it tries again.
26 template <typename T, typename R>
27 __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
28 #if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
29  if (sizeof(T) == 4) {
30  unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
31  unsigned int newval = oldval;
32  reducer.reduce(accum, reinterpret_cast<T*>(&newval));
33  if (newval == oldval) {
34  return;
35  }
36  unsigned int readback;
37  while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
38  oldval = readback;
39  newval = oldval;
40  reducer.reduce(accum, reinterpret_cast<T*>(&newval));
41  if (newval == oldval) {
42  return;
43  }
44  }
45  } else if (sizeof(T) == 8) {
46  unsigned long long oldval = *reinterpret_cast<unsigned long long*>(output);
47  unsigned long long newval = oldval;
48  reducer.reduce(accum, reinterpret_cast<T*>(&newval));
49  if (newval == oldval) {
50  return;
51  }
52  unsigned long long readback;
53  while ((readback = atomicCAS(reinterpret_cast<unsigned long long*>(output), oldval, newval)) != oldval) {
54  oldval = readback;
55  newval = oldval;
56  reducer.reduce(accum, reinterpret_cast<T*>(&newval));
57  if (newval == oldval) {
58  return;
59  }
60  }
61  } else {
62  gpu_assert(0 && "Wordsize not supported");
63  }
64 #else // EIGEN_CUDA_ARCH >= 300
65  EIGEN_UNUSED_VARIABLE(output);
66  EIGEN_UNUSED_VARIABLE(accum);
67  EIGEN_UNUSED_VARIABLE(reducer);
68  gpu_assert(0 && "Shouldn't be called on unsupported device");
69 #endif // EIGEN_CUDA_ARCH >= 300
70 }
71 
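// Illustrative sketch (not part of Eigen): a trivial kernel exercising the CAS-retry
// accumulation above. It assumes a SumReducer<float>-like functor exposing
// reduce(value, accum*); each thread folds one input element into a single output.
template <typename R>
__global__ void AtomicReduceSketchKernel(const float* data, int n, float* out, R reducer) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    // atomicReduce retries its compare-and-swap until this thread's contribution
    // has been folded into the most recent value stored at *out.
    atomicReduce(out, data[i], reducer);
  }
}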
72 // We extend atomicExch to support extra data types
73 template <typename Type>
74 __device__ inline Type atomicExchCustom(Type* address, Type val) {
75  return atomicExch(address, val);
76 }
77 
78 template <>
79 __device__ inline double atomicExchCustom(double* address, double val) {
80  unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address);
81  return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val)));
82 }
83 
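// Sketch (assumption, not Eigen code): the same bit-cast trick extends other word-wide
// atomics to double, e.g. the classic CAS-based atomicAdd fallback:
__device__ inline double atomicAddDoubleSketch(double* address, double val) {
  unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address);
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    // Reinterpret the bits, add in double precision, then try to publish the result.
    old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
  } while (assumed != old);
  return __longlong_as_double(old);
}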
84 #ifdef EIGEN_HAS_GPU_FP16
85 template <typename R>
86 __device__ inline void atomicReduce(half2* output, half2 accum, R& reducer) {
87  unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
88  unsigned int newval = oldval;
89  reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
90  if (newval == oldval) {
91  return;
92  }
93  unsigned int readback;
94  while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
95  oldval = readback;
96  newval = oldval;
97  reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
98  if (newval == oldval) {
99  return;
100  }
101  }
102 }
103 #ifdef EIGEN_GPU_COMPILE_PHASE
104 // The reducer must be associative: the wide-vector reduction is not atomic as a whole, only the individual half2 operations are.
105 template <typename R>
106 __device__ inline void atomicReduce(Packet4h2* output, Packet4h2 accum, R& reducer) {
107  half2* houtput = reinterpret_cast<half2*>(output);
108  half2* haccum = reinterpret_cast<half2*>(&accum);
109  for (int i = 0; i < 4; ++i) {
110  atomicReduce(houtput + i, *(haccum + i), reducer);
111  }
112 }
113 #endif // EIGEN_GPU_COMPILE_PHASE
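// Sketch (assumption): collapsing a half2 partial result into a scalar half with the
// reducer, mirroring the final __low2half/__high2half step used by the fp16 kernels below.
template <typename R>
__device__ inline half ReduceHalf2ToScalarSketch(half2 v, R& reducer) {
  half result = __low2half(v);
  reducer.reduce(__high2half(v), &result);
  return result;
}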
114 #endif // EIGEN_HAS_GPU_FP16
115 
116 template <>
117 __device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
118 #if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
119  atomicAdd(output, accum);
120 #else // EIGEN_CUDA_ARCH >= 300
121  EIGEN_UNUSED_VARIABLE(output);
122  EIGEN_UNUSED_VARIABLE(accum);
123  gpu_assert(0 && "Shouldn't be called on unsupported device");
124 #endif // EIGEN_CUDA_ARCH >= 300
125 }
126 
127 template <typename CoeffType, typename Index>
128 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs,
129  CoeffType* output) {
130  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
131  const Index num_threads = blockDim.x * gridDim.x;
132  for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
133  output[i] = val;
134  }
135 }
136 
137 template <int BlockSize, int NumPerThread, typename Self, typename Reducer, typename Index>
138 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
139  typename Self::CoeffReturnType* output,
140  unsigned int* semaphore) {
141 #if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
142  // Initialize the output value
143  const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
144  if (gridDim.x == 1) {
145  if (first_index == 0) {
146  *output = reducer.initialize();
147  }
148  } else {
149  if (threadIdx.x == 0) {
150  unsigned int block = atomicCAS(semaphore, 0u, 1u);
151  if (block == 0) {
152  // We're the first block to run, initialize the output value
153  atomicExchCustom(output, reducer.initialize());
154  __threadfence();
155  atomicExch(semaphore, 2u);
156  } else {
157  // Wait for the first block to initialize the output value.
158  // Use atomicCAS here to ensure that the reads aren't cached
159  unsigned int val;
160  do {
161  val = atomicCAS(semaphore, 2u, 2u);
162  } while (val < 2u);
163  }
164  }
165  }
166 
167  __syncthreads();
168 
169  eigen_assert(gridDim.x == 1 || *semaphore >= 2u);
170 
171  typename Self::CoeffReturnType accum = reducer.initialize();
172  Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread * BlockSize);
173  for (Index i = 0; i < max_iter; i += BlockSize) {
174  const Index index = first_index + i;
175  eigen_assert(index < num_coeffs);
176  typename Self::CoeffReturnType val = input.m_impl.coeff(index);
177  reducer.reduce(val, &accum);
178  }
179 
180 #pragma unroll
181  for (int offset = warpSize / 2; offset > 0; offset /= 2) {
182 #if defined(EIGEN_HIPCC)
183  // use std::is_floating_point to determine the type of reduced_val
184  // This is needed because when Type == double, hipcc will give a "call to __shfl_down is ambiguous" error
185  // and list the float and int versions of __shfl_down as the candidate functions.
186  if (std::is_floating_point<typename Self::CoeffReturnType>::value) {
187  reducer.reduce(__shfl_down(static_cast<float>(accum), offset, warpSize), &accum);
188  } else {
189  reducer.reduce(__shfl_down(static_cast<int>(accum), offset, warpSize), &accum);
190  }
191 #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
192  reducer.reduce(__shfl_down(accum, offset, warpSize), &accum);
193 #else
194  reducer.reduce(__shfl_down_sync(0xFFFFFFFF, accum, offset, warpSize), &accum);
195 #endif
196  }
197 
198  if ((threadIdx.x & (warpSize - 1)) == 0) {
199  atomicReduce(output, accum, reducer);
200  }
201 
202  if (gridDim.x > 1 && threadIdx.x == 0) {
203  // Let the last block reset the semaphore
204  atomicInc(semaphore, gridDim.x + 1);
205 #if defined(EIGEN_HIPCC)
206  __threadfence_system();
207 #endif
208  }
209 #else // EIGEN_CUDA_ARCH >= 300
210  EIGEN_UNUSED_VARIABLE(reducer);
211  EIGEN_UNUSED_VARIABLE(input);
212  EIGEN_UNUSED_VARIABLE(num_coeffs);
213  EIGEN_UNUSED_VARIABLE(output);
214  EIGEN_UNUSED_VARIABLE(semaphore);
215  gpu_assert(0 && "Shouldn't be called on unsupported device");
216 #endif // EIGEN_CUDA_ARCH >= 300
217 }
218 
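// Sketch (assumption, CUDA 9+ __shfl_down_sync path only): the warp-level tree reduction
// used by the kernel above, isolated for a plain float sum. Each step halves the number
// of contributing lanes; after log2(warpSize) steps lane 0 holds the sum of all lanes.
//
//   __device__ inline float WarpSumSketch(float v) {
//     for (int offset = warpSize / 2; offset > 0; offset /= 2) {
//       v += __shfl_down_sync(0xFFFFFFFF, v, offset, warpSize);
//     }
//     return v;  // only lane 0 ends up with the full warp sum
//   }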
219 #ifdef EIGEN_HAS_GPU_FP16
220 template <typename Self, typename Reducer, typename Index>
221 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input,
222  Index num_coeffs, half* scratch) {
223  eigen_assert(blockDim.x == 1);
224  eigen_assert(gridDim.x == 1);
225  typedef packet_traits<Eigen::half>::type packet_type;
226  Index packet_remainder = num_coeffs % Index(unpacket_traits<packet_type>::size);
227  if (packet_remainder != 0) {
228  half2* h2scratch = reinterpret_cast<half2*>(scratch);
229  for (Index i = num_coeffs - packet_remainder; i + 2 <= num_coeffs; i += 2) {
230  *h2scratch = __halves2half2(input.coeff(i), input.coeff(i + 1));
231  h2scratch++;
232  }
233  if ((num_coeffs & 1) != 0) {
234  half lastCoeff = input.coeff(num_coeffs - 1);
235  *h2scratch = __halves2half2(lastCoeff, reducer.initialize());
236  }
237  } else {
238  packet_type reduce = reducer.template initializePacket<packet_type>();
239  internal::pstoreu(scratch, reduce);
240  }
241 }
242 
243 template <typename Self, typename Reducer, typename Index>
244 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernelHalfFloat(Reducer reducer, const Self /*input*/,
245  Index num_coeffs, half* output) {
246  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
247  const Index num_threads = blockDim.x * gridDim.x;
248  typedef typename packet_traits<Eigen::half>::type PacketType;
249 
250  const Index num_packets = num_coeffs / Index(unpacket_traits<PacketType>::size);
251  PacketType* p_output = reinterpret_cast<PacketType*>(output);
252  for (Index i = thread_id; i < num_packets; i += num_threads) {
253  p_output[i] = reducer.template initializePacket<PacketType>();
254  }
255  Index packet_remainder = num_coeffs % Index(unpacket_traits<PacketType>::size);
256  if (thread_id < packet_remainder) {
257  output[num_coeffs - packet_remainder + thread_id] = reducer.initialize();
258  }
259 }
260 
261 template <int BlockSize, int NumPerThread, typename Self, typename Reducer, typename Index>
262 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(Reducer reducer, const Self input,
263  Index num_coeffs, half* output,
264  half* scratch) {
265  typedef typename packet_traits<Eigen::half>::type PacketType;
266  const int packet_width = unpacket_traits<PacketType>::size;
267  eigen_assert(NumPerThread % packet_width == 0);
268  const Index first_index = blockIdx.x * BlockSize * NumPerThread + packet_width * threadIdx.x;
269 
270  // Initialize the output value if it wasn't initialized by the ReductionInitKernel
271 
272  if (gridDim.x == 1) {
273  if (first_index == 0) {
274  int rem = num_coeffs % packet_width;
275  if (rem != 0) {
276  half2* p_scratch = reinterpret_cast<half2*>(scratch);
277  pstoreu(scratch, reducer.template initializePacket<PacketType>());
278  for (int i = 0; i < rem / 2; i++) {
279  *p_scratch = __halves2half2(input.coeff(num_coeffs - packet_width + 2 * i),
280  input.coeff(num_coeffs - packet_width + 2 * i + 1));
281  p_scratch++;
282  }
283  if ((num_coeffs & 1) != 0) {
284  half last = input.coeff(num_coeffs - 1);
285  *p_scratch = __halves2half2(last, reducer.initialize());
286  }
287  } else {
288  PacketType reduce = reducer.template initializePacket<PacketType>();
289  pstoreu(scratch, reduce);
290  }
291  }
292  __syncthreads();
293  }
294 
295  PacketType accum = reducer.template initializePacket<PacketType>();
296  const Index max_iter =
297  numext::mini<Index>((num_coeffs - first_index) / packet_width, NumPerThread * BlockSize / packet_width);
298  for (Index i = 0; i < max_iter; i += BlockSize) {
299  const Index index = first_index + packet_width * i;
300  eigen_assert(index + packet_width < num_coeffs);
301  PacketType val = input.template packet<Unaligned>(index);
302  reducer.reducePacket(val, &accum);
303  }
304 
305 #pragma unroll
306  for (int offset = warpSize / 2; offset > 0; offset /= 2) {
307 #if defined(EIGEN_HIPCC)
308  PacketType r1;
309  half2* hr = reinterpret_cast<half2*>(&r1);
310  half2* hacc = reinterpret_cast<half2*>(&accum);
311  for (int i = 0; i < packet_width / 2; i++) {
312  // FIXME : remove this workaround once we have native half/half2 support for __shfl_down
313  union {
314  int i;
315  half2 h;
316  } wka_in, wka_out;
317  wka_in.h = hacc[i];
318  wka_out.i = __shfl_down(wka_in.i, offset, warpSize);
319  hr[i] = wka_out.h;
320  }
321  reducer.reducePacket(r1, &accum);
322 #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
323  PacketType r1;
324  half2* hr = reinterpret_cast<half2*>(&r1);
325  half2* hacc = reinterpret_cast<half2*>(&accum);
326  for (int i = 0; i < packet_width / 2; i++) {
327  hr[i] = __shfl_down(hacc[i], offset, warpSize);
328  }
329  reducer.reducePacket(r1, &accum);
330 #else
331  PacketType r1;
332  half2* hr = reinterpret_cast<half2*>(&r1);
333  half2* hacc = reinterpret_cast<half2*>(&accum);
334  for (int i = 0; i < packet_width / 2; i++) {
335  hr[i] = __shfl_down_sync(0xFFFFFFFF, hacc[i], (unsigned)offset, warpSize);
336  }
337  reducer.reducePacket(r1, &accum);
338 
339 #endif
340  }
341 
342  if ((threadIdx.x & (warpSize - 1)) == 0) {
343  atomicReduce(reinterpret_cast<PacketType*>(scratch), accum, reducer);
344  }
345 
346  __syncthreads();
347  half2* rv1 = reinterpret_cast<half2*>(scratch);
348  if (packet_width > 2) {
349  reducer.reducePacket(rv1[2], rv1);
350  reducer.reducePacket(rv1[3], rv1 + 1);
351  reducer.reducePacket(rv1[1], rv1);
352  }
353  if (gridDim.x == 1) {
354  if (first_index == 0) {
355  half tmp = __low2half(*rv1);
356  reducer.reduce(__high2half(*rv1), &tmp);
357  *output = tmp;
358  }
359  }
360 }
361 
362 template <typename Op>
363 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionCleanupKernelHalfFloat(Op reducer, half* output, half* scratch) {
364  eigen_assert(threadIdx.x == 1);
365  typedef packet_traits<Eigen::half>::type packet_type;
366  if (unpacket_traits<packet_type>::size == 1) {
367  *output = *scratch;
368  } else {
369  half2* pscratch = reinterpret_cast<half2*>(scratch);
370  half tmp = __float2half(0.f);
371  for (int i = 0; i < unpacket_traits<packet_type>::size; i += 2) {
372  reducer.reduce(__low2half(*pscratch), &tmp);
373  reducer.reduce(__high2half(*pscratch), &tmp);
374  pscratch++;
375  }
376  *output = tmp;
377  }
378 }
379 
380 #endif // EIGEN_HAS_GPU_FP16
381 
382 template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
383 struct FullReductionLauncher {
384  static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) {
385  gpu_assert(false && "Should only be called on doubles, floats and half floats");
386  }
387 };
388 
389 // Specialization for float and double
390 template <typename Self, typename Op, typename OutputType, bool PacketAccess>
391 struct FullReductionLauncher<
392  Self, Op, OutputType, PacketAccess,
393  std::enable_if_t<internal::is_same<float, OutputType>::value || internal::is_same<double, OutputType>::value,
394  void>> {
395  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output,
396  typename Self::Index num_coeffs) {
397  typedef typename Self::Index Index;
398  const int block_size = 256;
399  const int num_per_thread = 128;
400  const int num_blocks = numext::div_ceil<int>(num_coeffs, block_size * num_per_thread);
401 
402  unsigned int* semaphore = NULL;
403  if (num_blocks > 1) {
404  semaphore = device.semaphore();
405  }
406 
407  LAUNCH_GPU_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>), num_blocks, block_size, 0,
408  device, reducer, self, num_coeffs, output, semaphore);
409  }
410 };
411 
412 #ifdef EIGEN_HAS_GPU_FP16
413 template <typename Self, typename Op>
414 struct FullReductionLauncher<Self, Op, Eigen::half, false> {
415  static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) {
416  gpu_assert(false && "Should not be called since there is no packet accessor");
417  }
418 };
419 
420 template <typename Self, typename Op>
421 struct FullReductionLauncher<Self, Op, Eigen::half, true> {
422  static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output,
423  typename Self::Index num_coeffs) {
424  typedef typename Self::Index Index;
425 
426  const int block_size = 256;
427  const int num_per_thread = 128;
428  const int num_blocks = numext::div_ceil<int>(num_coeffs, block_size * num_per_thread);
429  half* scratch = static_cast<half*>(device.scratchpad());
430 
431  if (num_blocks > 1) {
432  // We initialize the output and the scratchpad outside the reduction kernel when we can't be sure that there
433  // won't be race conditions between multiple thread blocks.
434  LAUNCH_GPU_KERNEL((ReductionInitFullReduxKernelHalfFloat<Self, Op, Index>), 1, 1, 0, device, reducer, self,
435  num_coeffs, scratch);
436  }
437 
438  LAUNCH_GPU_KERNEL((FullReductionKernelHalfFloat<block_size, num_per_thread, Self, Op, Index>), num_blocks,
439  block_size, 0, device, reducer, self, num_coeffs, output, scratch);
440 
441  if (num_blocks > 1) {
442  LAUNCH_GPU_KERNEL((ReductionCleanupKernelHalfFloat<Op>), 1, 1, 0, device, reducer, output, scratch);
443  }
444  }
445 };
446 #endif // EIGEN_HAS_GPU_FP16
447 
448 template <typename Self, typename Op, bool Vectorizable>
449 struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
450  // Unfortunately NVIDIA GPUs don't handle exotic types such as complex well,
451  // so reduce the scope of the optimized version of the code to the simple cases
452  // of doubles, floats and half floats.
453 #ifdef EIGEN_HAS_GPU_FP16
454  static constexpr bool HasOptimizedImplementation =
455      !Self::ReducerTraits::IsStateful && (internal::is_same<typename Self::CoeffReturnType, float>::value ||
456                                           internal::is_same<typename Self::CoeffReturnType, double>::value ||
457                                           (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value &&
458                                            reducer_traits<Op, GpuDevice>::PacketAccess));
459 #else // EIGEN_HAS_GPU_FP16
460  static constexpr bool HasOptimizedImplementation =
461      !Self::ReducerTraits::IsStateful && (internal::is_same<typename Self::CoeffReturnType, float>::value ||
462                                           internal::is_same<typename Self::CoeffReturnType, double>::value);
463 #endif // EIGEN_HAS_GPU_FP16
464 
465  template <typename OutputType>
466  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
467  gpu_assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
468  const Index num_coeffs = array_prod(self.m_impl.dimensions());
469  // Don't crash when we're called with an input tensor of size 0.
470  if (num_coeffs == 0) {
471  return;
472  }
473 
474  FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device,
475  output, num_coeffs);
476  }
477 };
478 
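// Usage sketch (assumption): the kind of host-side expression that reaches the
// FullReducer / FullReductionLauncher path above when evaluated on a GpuDevice.
// The tensors are assumed to live in GPU memory via TensorMap over device pointers.
//
//   Eigen::TensorMap<Eigen::Tensor<float, 2>> in(d_in, rows, cols);   // d_in : device pointer
//   Eigen::TensorMap<Eigen::Tensor<float, 0>> out(d_out);             // d_out: device pointer
//   out.device(gpu_device) = in.sum();  // full reduction -> FullReducer<..., GpuDevice>::run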
479 template <int NumPerThread, typename Self, typename Reducer, typename Index>
480 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(Reducer reducer, const Self input,
481  Index num_coeffs_to_reduce,
482  Index num_preserved_coeffs,
483  typename Self::CoeffReturnType* output) {
484 #if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
485  typedef typename Self::CoeffReturnType Type;
486  eigen_assert(blockDim.y == 1);
487  eigen_assert(blockDim.z == 1);
488  eigen_assert(gridDim.y == 1);
489  eigen_assert(gridDim.z == 1);
490 
491  const int unroll_times = 16;
492  eigen_assert(NumPerThread % unroll_times == 0);
493 
494  const Index input_col_blocks = numext::div_ceil<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread);
495  const Index num_input_blocks = input_col_blocks * num_preserved_coeffs;
496 
497  const Index num_threads = blockDim.x * gridDim.x;
498  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
499 
500  // Initialize the output values if they weren't initialized by the ReductionInitKernel
501  if (gridDim.x == 1) {
502  for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
503  output[i] = reducer.initialize();
504  }
505  __syncthreads();
506  }
507 
508  for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
509  const Index row = i / input_col_blocks;
510 
511  if (row < num_preserved_coeffs) {
512  const Index col_block = i % input_col_blocks;
513  const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x;
514 
515  Type reduced_val = reducer.initialize();
516 
517  for (Index j = 0; j < NumPerThread; j += unroll_times) {
518  const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1);
519  if (last_col >= num_coeffs_to_reduce) {
520  for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col += blockDim.x) {
521  const Type val = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
522  reducer.reduce(val, &reduced_val);
523  }
524  break;
525  } else {
526  // Faster version of the loop with no branches after unrolling.
527 #pragma unroll
528  for (int k = 0; k < unroll_times; ++k) {
529  const Index col = col_begin + blockDim.x * (j + k);
530  reducer.reduce(input.m_impl.coeff(row * num_coeffs_to_reduce + col), &reduced_val);
531  }
532  }
533  }
534 
535 #pragma unroll
536  for (int offset = warpSize / 2; offset > 0; offset /= 2) {
537 #if defined(EIGEN_HIPCC)
538  // use std::is_floating_point to determine the type of reduced_val
539  // This is needed because when Type == double, hipcc will give a "call to __shfl_down is ambiguous" error
540  // and list the float and int versions of __shfl_down as the candidate functions.
541  if (std::is_floating_point<Type>::value) {
542  reducer.reduce(__shfl_down(static_cast<float>(reduced_val), offset), &reduced_val);
543  } else {
544  reducer.reduce(__shfl_down(static_cast<int>(reduced_val), offset), &reduced_val);
545  }
546 #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
547  reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val);
548 #else
549  reducer.reduce(__shfl_down_sync(0xFFFFFFFF, reduced_val, offset), &reduced_val);
550 #endif
551  }
552 
553  if ((threadIdx.x & (warpSize - 1)) == 0) {
554  atomicReduce(&(output[row]), reduced_val, reducer);
555  }
556  }
557  }
558 #else // EIGEN_CUDA_ARCH >= 300
559  gpu_assert(0 && "Shouldn't be called on unsupported device");
560 #endif // EIGEN_CUDA_ARCH >= 300
561 }
562 
563 #ifdef EIGEN_HAS_GPU_FP16
564 
565 template <int NumPerThread, typename Self, typename Reducer, typename Index>
566 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(Reducer reducer, const Self input,
567  Index num_coeffs_to_reduce,
568  Index num_preserved_coeffs, half* output) {
569  eigen_assert(blockDim.y == 1);
570  eigen_assert(blockDim.z == 1);
571  eigen_assert(gridDim.y == 1);
572  eigen_assert(gridDim.z == 1);
573 
574  typedef typename packet_traits<Eigen::half>::type PacketType;
575  const int packet_width = unpacket_traits<PacketType>::size;
576  const int unroll_times = 16 / packet_width;
577  eigen_assert(NumPerThread % unroll_times == 0);
578  eigen_assert(unroll_times % 2 == 0);
579 
580  const Index input_col_blocks = numext::div_ceil<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread * 2);
581  const Index num_input_blocks = numext::div_ceil<Index>(input_col_blocks * num_preserved_coeffs, 2);
582 
583  const Index num_threads = blockDim.x * gridDim.x;
584  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
585 
586  // Initialize the output values if they weren't initialized by the ReductionInitKernel
587  if (gridDim.x == 1) {
588  Index i = packet_width * thread_id;
589  for (; i + packet_width <= num_preserved_coeffs; i += packet_width * num_threads) {
590  PacketType* poutput = reinterpret_cast<PacketType*>(output + i);
591  *poutput = reducer.template initializePacket<PacketType>();
592  }
593  if (i < num_preserved_coeffs) {
594  output[i] = reducer.initialize();
595  }
596  __syncthreads();
597  }
598 
599  for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
600  const Index row = 2 * (i / input_col_blocks); // everybody takes 2 rows
601 
602  if (row + 1 < num_preserved_coeffs) {
603  const Index col_block = i % input_col_blocks;
604  const Index col_begin = packet_width * (col_block * blockDim.x * NumPerThread + threadIdx.x);
605 
606  PacketType reduced_val1 = reducer.template initializePacket<PacketType>();
607  PacketType reduced_val2 = reducer.template initializePacket<PacketType>();
608 
609  for (Index j = 0; j < NumPerThread; j += unroll_times) {
610  const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1) * packet_width;
611  if (last_col >= num_coeffs_to_reduce) {
612  Index col = col_begin + blockDim.x * j;
613  for (; col + packet_width <= num_coeffs_to_reduce; col += blockDim.x) {
614  const PacketType val1 = input.m_impl.template packet<Unaligned>(row * num_coeffs_to_reduce + col);
615  reducer.reducePacket(val1, &reduced_val1);
616  const PacketType val2 = input.m_impl.template packet<Unaligned>((row + 1) * num_coeffs_to_reduce + col);
617  reducer.reducePacket(val2, &reduced_val2);
618  }
619  if (col < num_coeffs_to_reduce) {
620  PacketType r1 = reducer.template initializePacket<PacketType>();
621  PacketType r2 = reducer.template initializePacket<PacketType>();
622  half2* hr1 = reinterpret_cast<half2*>(&r1);
623  half2* hr2 = reinterpret_cast<half2*>(&r2);
624  while (col + 1 < num_coeffs_to_reduce) {
625  *hr1 = __halves2half2(input.m_impl.coeff(row * num_coeffs_to_reduce + col),
626  input.m_impl.coeff(row * num_coeffs_to_reduce + col + 1));
627  *hr2 = __halves2half2(input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col),
628  input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col + 1));
629  hr1++;
630  hr2++;
631  col += 2;
632  }
633  if (col < num_coeffs_to_reduce) {
634  // Peel;
635  const half last1 = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
636  *hr1 = __halves2half2(last1, reducer.initialize());
637  const half last2 = input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col);
638  *hr2 = __halves2half2(last2, reducer.initialize());
639  }
640  reducer.reducePacket(r1, &reduced_val1);
641  reducer.reducePacket(r2, &reduced_val2);
642  }
643  break;
644  } else {
645  // Faster version of the loop with no branches after unrolling.
646 #pragma unroll
647  for (int k = 0; k < unroll_times; ++k) {
648  const Index col = col_begin + blockDim.x * (j + k) * packet_width;
649  reducer.reducePacket(input.m_impl.template packet<Unaligned>(row * num_coeffs_to_reduce + col),
650  &reduced_val1);
651  reducer.reducePacket(input.m_impl.template packet<Unaligned>((row + 1) * num_coeffs_to_reduce + col),
652  &reduced_val2);
653  }
654  }
655  }
656 
657 #pragma unroll
658  for (int offset = warpSize / 2; offset > 0; offset /= 2) {
659 #if defined(EIGEN_HIPCC)
660  PacketType r1;
661  PacketType r2;
662  half2* hr1 = reinterpret_cast<half2*>(&r1);
663  half2* hr2 = reinterpret_cast<half2*>(&r2);
664  half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
665  half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
666  for (int i = 0; i < packet_width / 2; i++) {
667  // FIXME : remove this workaround once we have native half/half2 support for __shfl_down
668  union {
669  int i;
670  half2 h;
671  } wka_in1, wka_out1;
672  wka_in1.h = rv1[i];
673  wka_out1.i = __shfl_down(wka_in1.i, offset, warpSize);
674  hr1[i] = wka_out1.h;
675 
676  union {
677  int i;
678  half2 h;
679  } wka_in2, wka_out2;
680  wka_in2.h = rv2[i];
681  wka_out2.i = __shfl_down(wka_in2.i, offset, warpSize);
682  hr2[i] = wka_out2.h;
683  }
684  reducer.reducePacket(r1, &reduced_val1);
685  reducer.reducePacket(r2, &reduced_val2);
686 #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
687  PacketType r1;
688  PacketType r2;
689  half2* hr1 = reinterpret_cast<half2*>(&r1);
690  half2* hr2 = reinterpret_cast<half2*>(&r2);
691  half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
692  half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
693  for (int i = 0; i < packet_width / 2; i++) {
694  hr1[i] = __shfl_down(rv1[i], offset, warpSize);
695  hr2[i] = __shfl_down(rv2[i], offset, warpSize);
696  }
697  reducer.reducePacket(r1, &reduced_val1);
698  reducer.reducePacket(r2, &reduced_val2);
699 #else
700  PacketType r1;
701  PacketType r2;
702  half2* hr1 = reinterpret_cast<half2*>(&r1);
703  half2* hr2 = reinterpret_cast<half2*>(&r2);
704  half2* rr1 = reinterpret_cast<half2*>(&reduced_val1);
705  half2* rr2 = reinterpret_cast<half2*>(&reduced_val2);
706  for (int j = 0; j < packet_width / 2; j++) {
707  hr1[j] = __shfl_down_sync(0xFFFFFFFF, rr1[j], (unsigned)offset, warpSize);
708  hr2[j] = __shfl_down_sync(0xFFFFFFFF, rr2[j], (unsigned)offset, warpSize);
709  }
710  reducer.reducePacket(r1, &reduced_val1);
711  reducer.reducePacket(r2, &reduced_val2);
712 
713 #endif
714  }
715  half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
716  half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
717  half2 val;
718  if (packet_width > 2) {
719  reducer.reducePacket(rv1[2], rv1);
720  reducer.reducePacket(rv1[3], rv1 + 1);
721  reducer.reducePacket(rv1[1], rv1);
722  reducer.reducePacket(rv2[2], rv2);
723  reducer.reducePacket(rv2[3], rv2 + 1);
724  reducer.reducePacket(rv2[1], rv2);
725  }
726  half val1 = __low2half(*rv1);
727  reducer.reduce(__high2half(*rv1), &val1);
728  half val2 = __low2half(*rv2);
729  reducer.reduce(__high2half(*rv2), &val2);
730  val = __halves2half2(val1, val2);
731  if ((threadIdx.x & (warpSize - 1)) == 0) {
732  half* loc = output + row;
733  atomicReduce(reinterpret_cast<half2*>(loc), val, reducer);
734  }
735  }
736  }
737 }
738 
739 #endif // EIGEN_HAS_GPU_FP16
740 
741 template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
742 struct InnerReductionLauncher {
743  static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index,
744  typename Self::Index) {
745  gpu_assert(false && "Should only be called to reduce doubles, floats and half floats on a gpu device");
746  return true;
747  }
748 };
749 
750 // Specialization for float and double
751 template <typename Self, typename Op, typename OutputType, bool PacketAccess>
752 struct InnerReductionLauncher<
753  Self, Op, OutputType, PacketAccess,
754  std::enable_if_t<internal::is_same<float, OutputType>::value || internal::is_same<double, OutputType>::value,
755  void>> {
756  static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output,
757  typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
758  typedef typename Self::Index Index;
759 
760  const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
761  const int block_size = 256;
762  const int num_per_thread = 128;
763  const int dyn_blocks = numext::div_ceil<int>(num_coeffs, block_size * num_per_thread);
764  const int max_blocks = device.getNumGpuMultiProcessors() * device.maxGpuThreadsPerMultiProcessor() / block_size;
765  const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
766 
767  if (num_blocks > 1) {
768  // We initialize the outputs outside the reduction kernel when we can't be sure that there
769  // won't be race conditions between multiple thread blocks.
770  const int dyn_blocks2 = numext::div_ceil<int>(num_preserved_vals, 1024);
771  const int max_blocks2 = device.getNumGpuMultiProcessors() * device.maxGpuThreadsPerMultiProcessor() / 1024;
772  const int num_blocks2 = numext::mini<int>(max_blocks2, dyn_blocks2);
773  LAUNCH_GPU_KERNEL((ReductionInitKernel<OutputType, Index>), num_blocks2, 1024, 0, device, reducer.initialize(),
774  num_preserved_vals, output);
775  }
776 
777  LAUNCH_GPU_KERNEL((InnerReductionKernel<num_per_thread, Self, Op, Index>), num_blocks, block_size, 0, device,
778  reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
779 
780  return false;
781  }
782 };
783 
784 #ifdef EIGEN_HAS_GPU_FP16
785 template <typename Self, typename Op>
786 struct InnerReductionLauncher<Self, Op, Eigen::half, false> {
787  static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) {
788  gpu_assert(false && "Should not be called since there is no packet accessor");
789  return true;
790  }
791 };
792 
793 template <typename Self, typename Op>
794 struct InnerReductionLauncher<Self, Op, Eigen::half, true> {
795  static bool run(const Self& self, Op& reducer, const GpuDevice& device, half* output,
796  typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
797  typedef typename Self::Index Index;
798 
799  if (num_preserved_vals % 2 != 0) {
800  // Not supported yet, revert to the slower code path
801  return true;
802  }
803 
804  const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
805  const int block_size = /*256*/ 128;
806  const int num_per_thread = /*128*/ 64;
807  const int dyn_blocks = numext::div_ceil<int>(num_coeffs, block_size * num_per_thread);
808  const int max_blocks = device.getNumGpuMultiProcessors() * device.maxGpuThreadsPerMultiProcessor() / block_size;
809  const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
810 
811  if (num_blocks > 1) {
812  // We initialize the outputs outside the reduction kernel when we can't be sure that there
813  // won't be race conditions between multiple thread blocks.
814  LAUNCH_GPU_KERNEL((ReductionInitKernelHalfFloat<Self, Op, Index>), 1, 1, 0, device, reducer, self,
815  num_preserved_vals, output);
816  }
817 
818  LAUNCH_GPU_KERNEL((InnerReductionKernelHalfFloat<num_per_thread, Self, Op, Index>), num_blocks, block_size, 0,
819  device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
820 
821  return false;
822  }
823 };
824 #endif // EIGEN_HAS_GPU_FP16
825 
826 template <typename Self, typename Op>
827 struct InnerReducer<Self, Op, GpuDevice> {
828  // Unfortunately NVIDIA GPUs don't handle exotic types such as complex well,
829  // so reduce the scope of the optimized version of the code to the simple cases
830  // of doubles, floats and half floats.
831 #ifdef EIGEN_HAS_GPU_FP16
832  static constexpr bool HasOptimizedImplementation =
833      !Self::ReducerTraits::IsStateful && (internal::is_same<typename Self::CoeffReturnType, float>::value ||
834                                           internal::is_same<typename Self::CoeffReturnType, double>::value ||
835                                           (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value &&
836                                            reducer_traits<Op, GpuDevice>::PacketAccess));
837 #else // EIGEN_HAS_GPU_FP16
838  static constexpr bool HasOptimizedImplementation =
839      !Self::ReducerTraits::IsStateful && (internal::is_same<typename Self::CoeffReturnType, float>::value ||
840                                           internal::is_same<typename Self::CoeffReturnType, double>::value);
841 #endif // EIGEN_HAS_GPU_FP16
842 
843  template <typename OutputType>
844  static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output,
845  typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
846  gpu_assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
847  const Index num_coeffs = array_prod(self.m_impl.dimensions());
848  // Don't crash when we're called with an input tensor of size 0.
849  if (num_coeffs == 0) {
850  return true;
851  }
852  // It's faster to use the usual code.
853  if (num_coeffs_to_reduce <= 128) {
854  return true;
855  }
856 
857  return InnerReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(
858  self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);
859  }
860 };
861 
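// Usage sketch (assumption): a partial reduction over the innermost (contiguous)
// dimension, which is the case InnerReducer<..., GpuDevice> above may handle. For the
// default column-major layout, dimension 0 is the innermost one.
//
//   Eigen::TensorMap<Eigen::Tensor<float, 2>> in(d_in, rows, cols);
//   Eigen::TensorMap<Eigen::Tensor<float, 1>> out(d_out, cols);
//   Eigen::array<Eigen::Index, 1> dims{{0}};
//   out.device(gpu_device) = in.sum(dims);  // sums over dimension 0 for each column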
862 template <int NumPerThread, typename Self, typename Reducer, typename Index>
863 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(Reducer reducer, const Self input,
864  Index num_coeffs_to_reduce,
865  Index num_preserved_coeffs,
866  typename Self::CoeffReturnType* output) {
867  const Index num_threads = blockDim.x * gridDim.x;
868  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
869  // Initialize the output values if they weren't initialized by the ReductionInitKernel
870  if (gridDim.x == 1) {
871  for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
872  output[i] = reducer.initialize();
873  }
874  __syncthreads();
875  }
876 
877  // Do the reduction.
878  const Index max_iter = num_preserved_coeffs * numext::div_ceil<Index>(num_coeffs_to_reduce, NumPerThread);
879  for (Index i = thread_id; i < max_iter; i += num_threads) {
880  const Index input_col = i % num_preserved_coeffs;
881  const Index input_row = (i / num_preserved_coeffs) * NumPerThread;
882  typename Self::CoeffReturnType reduced_val = reducer.initialize();
883  const Index max_row = numext::mini(input_row + NumPerThread, num_coeffs_to_reduce);
884  for (Index j = input_row; j < max_row; j++) {
885  typename Self::CoeffReturnType val = input.m_impl.coeff(j * num_preserved_coeffs + input_col);
886  reducer.reduce(val, &reduced_val);
887  }
888  atomicReduce(&(output[input_col]), reduced_val, reducer);
889  }
890 }
891 
892 template <typename Self, typename Op>
893 struct OuterReducer<Self, Op, GpuDevice> {
894  // Unfortunately NVIDIA GPUs don't handle exotic types such as complex well,
895  // so reduce the scope of the optimized version of the code to the simple cases
896  // of doubles and floats.
897  static constexpr bool HasOptimizedImplementation =
898      !Self::ReducerTraits::IsStateful && (internal::is_same<typename Self::CoeffReturnType, float>::value ||
899                                           internal::is_same<typename Self::CoeffReturnType, double>::value);
900  template <typename Device, typename OutputType>
901  static
902 #if !defined(EIGEN_HIPCC)
903  // FIXME : leaving this EIGEN_DEVICE_FUNC in, results in the following runtime error
904  // (in the cxx11_tensor_reduction_gpu test)
905  //
906  // terminate called after throwing an instance of 'std::runtime_error'
907  // what(): No device code available for function: _ZN5Eigen8internal20OuterReductionKernelIL...
908  //
909  // don't know why this happens (and why is it a runtime error instead of a compile time error)
910  //
911  // this will be fixed by HIP PR#457
912  EIGEN_DEVICE_FUNC
913 #endif
914  bool
915  run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
916  gpu_assert(false && "Should only be called to reduce doubles or floats on a gpu device");
917  return true;
918  }
919 
920  static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output,
921  typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
922  typedef typename Self::Index Index;
923 
924  // It's faster to use the usual code.
925  if (num_coeffs_to_reduce <= 32) {
926  return true;
927  }
928 
929  const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
930  const int block_size = 256;
931  const int num_per_thread = 16;
932  const int dyn_blocks = numext::div_ceil<int>(num_coeffs, block_size * num_per_thread);
933  const int max_blocks = device.getNumGpuMultiProcessors() * device.maxGpuThreadsPerMultiProcessor() / block_size;
934  const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
935 
936  if (num_blocks > 1) {
937  // We initialize the outputs in the reduction kernel itself when we don't have to worry
938  // about race conditions between multiple thread blocks.
939  const int dyn_blocks2 = numext::div_ceil<int>(num_preserved_vals, 1024);
940  const int max_blocks2 = device.getNumGpuMultiProcessors() * device.maxGpuThreadsPerMultiProcessor() / 1024;
941  const int num_blocks2 = numext::mini<int>(max_blocks2, dyn_blocks2);
942  LAUNCH_GPU_KERNEL((ReductionInitKernel<float, Index>), num_blocks2, 1024, 0, device, reducer.initialize(),
943  num_preserved_vals, output);
944  }
945 
946  LAUNCH_GPU_KERNEL((OuterReductionKernel<num_per_thread, Self, Op, Index>), num_blocks, block_size, 0, device,
947  reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
948 
949  return false;
950  }
951 };
952 
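// Usage sketch (assumption): a partial reduction over the outermost (strided) dimension,
// which is the pattern OuterReducer<..., GpuDevice> above targets: the preserved
// coefficients are contiguous and the reduced index strides over them.
//
//   Eigen::TensorMap<Eigen::Tensor<float, 2>> in(d_in, rows, cols);
//   Eigen::TensorMap<Eigen::Tensor<float, 1>> out(d_out, rows);
//   Eigen::array<Eigen::Index, 1> dims{{1}};
//   out.device(gpu_device) = in.sum(dims);  // reduces along the outer (last) dimension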
953 #endif // defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
954 
955 } // end namespace internal
956 } // end namespace Eigen
957 
958 #endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H