#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H
#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H

namespace Eigen {
namespace internal {

#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
// Full reducers for GPU, don't vectorize for now.

// Reducer function that enables multiple gpu threads to safely accumulate at the same
// output address. It reads the current value of the output variable and attempts to
// update it with the new value. If another gpu thread updated the output address in
// the meantime, it tries again.
template <typename T, typename R>
__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
  if (sizeof(T) == 4) {
    unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
    unsigned int newval = oldval;
    reducer.reduce(accum, reinterpret_cast<T*>(&newval));
    if (newval == oldval) {
      return;
    }
    unsigned int readback;
    while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
      oldval = readback;
      newval = oldval;
      reducer.reduce(accum, reinterpret_cast<T*>(&newval));
      if (newval == oldval) {
        return;
      }
    }
  } else if (sizeof(T) == 8) {
    unsigned long long oldval = *reinterpret_cast<unsigned long long*>(output);
    unsigned long long newval = oldval;
    reducer.reduce(accum, reinterpret_cast<T*>(&newval));
    if (newval == oldval) {
      return;
    }
    unsigned long long readback;
    while ((readback = atomicCAS(reinterpret_cast<unsigned long long*>(output), oldval, newval)) != oldval) {
      oldval = readback;
      newval = oldval;
      reducer.reduce(accum, reinterpret_cast<T*>(&newval));
      if (newval == oldval) {
        return;
      }
    }
  } else {
    gpu_assert(0 && "Wordsize not supported");
  }
#else   // EIGEN_CUDA_ARCH >= 300
  gpu_assert(0 && "Shouldn't be called on unsupported device");
#endif  // EIGEN_CUDA_ARCH >= 300
}
// We extend atomicExch to support extra data types.
template <typename Type>
__device__ inline Type atomicExchCustom(Type* address, Type val) {
  return atomicExch(address, val);
}

__device__ inline double atomicExchCustom(double* address, double val) {
  unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address);
  return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val)));
}
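// Illustrative sketch (not part of this header): atomicExch has no double overload,
// which is why atomicExchCustom above round-trips the value through its 64-bit bit
// pattern. The hypothetical kernel below shows the intended usage: a single thread
// resets a double accumulator before a multi-block reduction. Not compiled.
#if 0
__global__ void reset_double_accumulator(double* accum, double init_value) {
  if (blockIdx.x == 0 && threadIdx.x == 0) {
    atomicExchCustom(accum, init_value);
  }
}
#endif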
#ifdef EIGEN_HAS_GPU_FP16
template <typename R>
__device__ inline void atomicReduce(half2* output, half2 accum, R& reducer) {
  unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
  unsigned int newval = oldval;
  reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
  if (newval == oldval) {
    return;
  }
  unsigned int readback;
  while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
    oldval = readback;
    newval = oldval;
    reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
    if (newval == oldval) {
      return;
    }
  }
}

#ifdef EIGEN_GPU_COMPILE_PHASE
// The reduction must be associative: the wide Packet4h2 update is not atomic as a
// whole, only each half2 lane is updated atomically.
template <typename R>
__device__ inline void atomicReduce(Packet4h2* output, Packet4h2 accum, R& reducer) {
  half2* houtput = reinterpret_cast<half2*>(output);
  half2* haccum = reinterpret_cast<half2*>(&accum);
  for (int i = 0; i < 4; ++i) {
    atomicReduce(houtput + i, *(haccum + i), reducer);
  }
}
#endif  // EIGEN_GPU_COMPILE_PHASE
#endif  // EIGEN_HAS_GPU_FP16
template <>
__device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
  atomicAdd(output, accum);
#else   // EIGEN_CUDA_ARCH >= 300
  EIGEN_UNUSED_VARIABLE(output);
  EIGEN_UNUSED_VARIABLE(accum);
  gpu_assert(0 && "Shouldn't be called on unsupported device");
#endif  // EIGEN_CUDA_ARCH >= 300
}
template <typename CoeffType, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs,
                                                                  CoeffType* output) {
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
  const Index num_threads = blockDim.x * gridDim.x;
  for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
    output[i] = val;
  }
}
template <int BlockSize, int NumPerThread, typename Self, typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
                                                                  typename Self::CoeffReturnType* output,
                                                                  unsigned int* semaphore) {
#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
  // Initialize the output value.
  const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
  if (gridDim.x == 1) {
    if (first_index == 0) {
      *output = reducer.initialize();
    }
  } else {
    // Wait for the semaphore.
    unsigned int block = atomicCAS(semaphore, 0u, 1u);
    if (block == 0) {
      // We're the first block to run: initialize the output value.
      atomicExchCustom(output, reducer.initialize());
      __threadfence();
      atomicExch(semaphore, 2u);
    } else {
      // Wait for the first block to initialize the output value.
      unsigned int val;
      do {
        val = atomicCAS(semaphore, 2u, 2u);
      } while (val < 2u);
    }
  }

  __syncthreads();

  eigen_assert(gridDim.x == 1 || *semaphore >= 2u);

  typename Self::CoeffReturnType accum = reducer.initialize();
  Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread * BlockSize);
  for (Index i = 0; i < max_iter; i += BlockSize) {
    const Index index = first_index + i;
    eigen_assert(index < num_coeffs);
    typename Self::CoeffReturnType val = input.m_impl.coeff(index);
    reducer.reduce(val, &accum);
  }

  for (int offset = warpSize / 2; offset > 0; offset /= 2) {
#if defined(EIGEN_HIPCC)
    // Cast to a type __shfl_down accepts to avoid ambiguous-overload errors with hipcc.
    if (std::is_floating_point<typename Self::CoeffReturnType>::value) {
      reducer.reduce(__shfl_down(static_cast<float>(accum), offset, warpSize), &accum);
    } else {
      reducer.reduce(__shfl_down(static_cast<int>(accum), offset, warpSize), &accum);
    }
#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
    reducer.reduce(__shfl_down(accum, offset, warpSize), &accum);
#else
    reducer.reduce(__shfl_down_sync(0xFFFFFFFF, accum, offset, warpSize), &accum);
#endif
  }

  if ((threadIdx.x & (warpSize - 1)) == 0) {
    atomicReduce(output, accum, reducer);
  }

  if (gridDim.x > 1 && threadIdx.x == 0) {
    // Let the last block reset the semaphore.
    atomicInc(semaphore, gridDim.x + 1);
#if defined(EIGEN_HIPCC)
    __threadfence_system();
#endif
  }
#else   // EIGEN_CUDA_ARCH >= 300
  gpu_assert(0 && "Shouldn't be called on unsupported device");
#endif  // EIGEN_CUDA_ARCH >= 300
}
#ifdef EIGEN_HAS_GPU_FP16
template <typename Self, typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input,
                                                                                    Index num_coeffs, half* scratch) {
  eigen_assert(blockDim.x == 1);
  eigen_assert(gridDim.x == 1);
  typedef packet_traits<Eigen::half>::type packet_type;
  Index packet_remainder = num_coeffs % Index(unpacket_traits<packet_type>::size);
  if (packet_remainder != 0) {
    half2* h2scratch = reinterpret_cast<half2*>(scratch);
    for (Index i = num_coeffs - packet_remainder; i + 2 <= num_coeffs; i += 2) {
      *h2scratch = __halves2half2(input.coeff(i), input.coeff(i + 1));
      h2scratch++;
    }
    if ((num_coeffs & 1) != 0) {
      half lastCoeff = input.coeff(num_coeffs - 1);
      *h2scratch = __halves2half2(lastCoeff, reducer.initialize());
    }
  } else {
    packet_type reduce = reducer.template initializePacket<packet_type>();
    pstoreu(scratch, reduce);
  }
}
template <typename Self, typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernelHalfFloat(Reducer reducer, const Self input,
                                                                           Index num_coeffs, half* output) {
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
  const Index num_threads = blockDim.x * gridDim.x;
  typedef packet_traits<Eigen::half>::type PacketType;

  const Index num_packets = num_coeffs / Index(unpacket_traits<PacketType>::size);
  PacketType* p_output = reinterpret_cast<PacketType*>(output);
  for (Index i = thread_id; i < num_packets; i += num_threads) {
    p_output[i] = reducer.template initializePacket<PacketType>();
  }
  Index packet_remainder = num_coeffs % Index(unpacket_traits<PacketType>::size);
  if (thread_id < packet_remainder) {
    output[num_coeffs - packet_remainder + thread_id] = reducer.initialize();
  }
}
template <int BlockSize, int NumPerThread, typename Self, typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(Reducer reducer, const Self input,
                                                                           Index num_coeffs, half* output,
                                                                           half* scratch) {
  typedef packet_traits<Eigen::half>::type PacketType;
  const int packet_width = unpacket_traits<PacketType>::size;
  const Index first_index = blockIdx.x * BlockSize * NumPerThread + packet_width * threadIdx.x;

  // Initialize the scratchpad if it wasn't initialized by the ReductionInitKernel.
  if (gridDim.x == 1) {
    if (first_index == 0) {
      int rem = num_coeffs % packet_width;
      if (rem != 0) {
        half2* p_scratch = reinterpret_cast<half2*>(scratch);
        pstoreu(scratch, reducer.template initializePacket<PacketType>());
        for (int i = 0; i < rem / 2; i++) {
          *p_scratch = __halves2half2(input.coeff(num_coeffs - packet_width + 2 * i),
                                      input.coeff(num_coeffs - packet_width + 2 * i + 1));
          p_scratch++;
        }
        if ((num_coeffs & 1) != 0) {
          half last = input.coeff(num_coeffs - 1);
          *p_scratch = __halves2half2(last, reducer.initialize());
        }
      } else {
        PacketType reduce = reducer.template initializePacket<PacketType>();
        pstoreu(scratch, reduce);
      }
    }
    __syncthreads();
  }

  PacketType accum = reducer.template initializePacket<PacketType>();
  const Index max_iter =
      numext::mini<Index>((num_coeffs - first_index) / packet_width, NumPerThread * BlockSize / packet_width);
  for (Index i = 0; i < max_iter; i += BlockSize) {
    const Index index = first_index + packet_width * i;
    PacketType val = input.template packet<Unaligned>(index);
    reducer.reducePacket(val, &accum);
  }

  for (int offset = warpSize / 2; offset > 0; offset /= 2) {
#if defined(EIGEN_HIPCC)
    PacketType r1;
    half2* hr = reinterpret_cast<half2*>(&r1);
    half2* hacc = reinterpret_cast<half2*>(&accum);
    for (int i = 0; i < packet_width / 2; i++) {
      // Workaround for the lack of native half/half2 support in __shfl_down with hipcc.
      union {
        int i;
        half2 h;
      } wka_in, wka_out;
      wka_in.h = hacc[i];
      wka_out.i = __shfl_down(wka_in.i, offset, warpSize);
      hr[i] = wka_out.h;
    }
    reducer.reducePacket(r1, &accum);
#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
    PacketType r1;
    half2* hr = reinterpret_cast<half2*>(&r1);
    half2* hacc = reinterpret_cast<half2*>(&accum);
    for (int i = 0; i < packet_width / 2; i++) {
      hr[i] = __shfl_down(hacc[i], offset, warpSize);
    }
    reducer.reducePacket(r1, &accum);
#else
    PacketType r1;
    half2* hr = reinterpret_cast<half2*>(&r1);
    half2* hacc = reinterpret_cast<half2*>(&accum);
    for (int i = 0; i < packet_width / 2; i++) {
      hr[i] = __shfl_down_sync(0xFFFFFFFF, hacc[i], (unsigned)offset, warpSize);
    }
    reducer.reducePacket(r1, &accum);
#endif
  }

  if ((threadIdx.x & (warpSize - 1)) == 0) {
    atomicReduce(reinterpret_cast<PacketType*>(scratch), accum, reducer);
  }

  __syncthreads();
  half2* rv1 = reinterpret_cast<half2*>(scratch);
  if (packet_width > 2) {
    reducer.reducePacket(rv1[2], rv1);
    reducer.reducePacket(rv1[3], rv1 + 1);
    reducer.reducePacket(rv1[1], rv1);
  }
  if (gridDim.x == 1) {
    if (first_index == 0) {
      half tmp = __low2half(*rv1);
      reducer.reduce(__high2half(*rv1), &tmp);
      *output = tmp;
    }
  }
}
template <typename Op>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionCleanupKernelHalfFloat(Op reducer, half* output, half* scratch) {
  typedef packet_traits<Eigen::half>::type packet_type;
  const int packet_width = unpacket_traits<packet_type>::size;

  half2* pscratch = reinterpret_cast<half2*>(scratch);
  half tmp = __float2half(0.f);
  for (int i = 0; i < packet_width / 2; i++) {
    reducer.reduce(__low2half(*pscratch), &tmp);
    reducer.reduce(__high2half(*pscratch), &tmp);
    pscratch++;
  }
  *output = tmp;
}
#endif  // EIGEN_HAS_GPU_FP16
template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
struct FullReductionLauncher {
  static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) {
    gpu_assert(false && "Should only be called on doubles, floats and half floats");
  }
};
template <typename Self, typename Op, typename OutputType, bool PacketAccess>
struct FullReductionLauncher<
    Self, Op, OutputType, PacketAccess,
    std::enable_if_t<internal::is_same<float, OutputType>::value || internal::is_same<double, OutputType>::value,
                     void>> {
  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output,
                  typename Self::Index num_coeffs) {
    typedef typename Self::Index Index;

    const int block_size = 256;
    const int num_per_thread = 128;
    const int num_blocks = numext::div_ceil<int>(num_coeffs, block_size * num_per_thread);

    unsigned int* semaphore = NULL;
    if (num_blocks > 1) {
      semaphore = device.semaphore();
    }

    LAUNCH_GPU_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>), num_blocks, block_size, 0,
                      device, reducer, self, num_coeffs, output, semaphore);
  }
};
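// Illustrative sketch (not part of this header): how a full reduction reaches this
// launcher from user code, assuming the unsupported-API Tensor module and an already
// constructed GpuDevice. The function name is hypothetical; the assignment below
// dispatches to FullReducer<..., GpuDevice> and eventually to FullReductionKernel.
#if 0
#include <unsupported/Eigen/CXX11/Tensor>

void full_sum_on_gpu(const Eigen::GpuDevice& gpu_device,
                     const Eigen::TensorMap<Eigen::Tensor<float, 2>>& in,
                     Eigen::TensorMap<Eigen::Tensor<float, 0>>& out) {
  out.device(gpu_device) = in.sum();  // rank-0 result: one scalar for the whole tensor
}
#endif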
#ifdef EIGEN_HAS_GPU_FP16
template <typename Self, typename Op>
struct FullReductionLauncher<Self, Op, Eigen::half, false> {
  static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) {
    gpu_assert(false && "Should not be called since there is no packet accessor");
  }
};

template <typename Self, typename Op>
struct FullReductionLauncher<Self, Op, Eigen::half, true> {
  static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output,
                  typename Self::Index num_coeffs) {
    typedef typename Self::Index Index;

    const int block_size = 256;
    const int num_per_thread = 128;
    const int num_blocks = numext::div_ceil<int>(num_coeffs, block_size * num_per_thread);
    half* scratch = static_cast<half*>(device.scratchpad());

    if (num_blocks > 1) {
      // We initialize the scratchpad outside the reduction kernel when we can't be sure that
      // there won't be a race condition between multiple thread blocks.
      LAUNCH_GPU_KERNEL((ReductionInitFullReduxKernelHalfFloat<Self, Op, Index>), 1, 1, 0, device, reducer, self,
                        num_coeffs, scratch);
    }

    LAUNCH_GPU_KERNEL((FullReductionKernelHalfFloat<block_size, num_per_thread, Self, Op, Index>), num_blocks,
                      block_size, 0, device, reducer, self, num_coeffs, output, scratch);

    if (num_blocks > 1) {
      LAUNCH_GPU_KERNEL((ReductionCleanupKernelHalfFloat<Op>), 1, 1, 0, device, reducer, output, scratch);
    }
  }
};
#endif  // EIGEN_HAS_GPU_FP16
template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
  // Unfortunately nvidia doesn't support well exotic types such as complex, so reduce
  // the scope of the optimized version to the simple cases of doubles, floats and half floats.
#ifdef EIGEN_HAS_GPU_FP16
  static constexpr bool HasOptimizedImplementation =
      !Self::ReducerTraits::IsStateful &&
      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
       internal::is_same<typename Self::CoeffReturnType, double>::value ||
       (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value &&
        reducer_traits<Op, GpuDevice>::PacketAccess));
#else   // EIGEN_HAS_GPU_FP16
  static constexpr bool HasOptimizedImplementation =
      !Self::ReducerTraits::IsStateful && (internal::is_same<typename Self::CoeffReturnType, float>::value ||
                                           internal::is_same<typename Self::CoeffReturnType, double>::value);
#endif  // EIGEN_HAS_GPU_FP16

  template <typename OutputType>
  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
    gpu_assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
    const Index num_coeffs = array_prod(self.m_impl.dimensions());
    // Don't crash when we're called with an input tensor of size 0.
    if (num_coeffs == 0) {
      return;
    }

    FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device,
                                                                                                  output, num_coeffs);
  }
};
template <int NumPerThread, typename Self, typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(Reducer reducer, const Self input,
                                                                   Index num_coeffs_to_reduce,
                                                                   Index num_preserved_coeffs,
                                                                   typename Self::CoeffReturnType* output) {
#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
  typedef typename Self::CoeffReturnType Type;

  eigen_assert(blockDim.y == 1);
  eigen_assert(blockDim.z == 1);
  eigen_assert(gridDim.y == 1);
  eigen_assert(gridDim.z == 1);

  const int unroll_times = 16;
  eigen_assert(NumPerThread % unroll_times == 0);

  const Index input_col_blocks = numext::div_ceil<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread);
  const Index num_input_blocks = input_col_blocks * num_preserved_coeffs;

  const Index num_threads = blockDim.x * gridDim.x;
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;

  // Initialize the output values if they weren't initialized by the ReductionInitKernel.
  if (gridDim.x == 1) {
    for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
      output[i] = reducer.initialize();
    }
    __syncthreads();
  }

  for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
    const Index row = i / input_col_blocks;

    if (row < num_preserved_coeffs) {
      const Index col_block = i % input_col_blocks;
      const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x;

      Type reduced_val = reducer.initialize();

      for (Index j = 0; j < NumPerThread; j += unroll_times) {
        const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1);
        if (last_col >= num_coeffs_to_reduce) {
          for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col += blockDim.x) {
            const Type val = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
            reducer.reduce(val, &reduced_val);
          }
          break;
        } else {
          // Faster version of the loop with no branches after unrolling.
          for (int k = 0; k < unroll_times; ++k) {
            const Index col = col_begin + blockDim.x * (j + k);
            reducer.reduce(input.m_impl.coeff(row * num_coeffs_to_reduce + col), &reduced_val);
          }
        }
      }

      for (int offset = warpSize / 2; offset > 0; offset /= 2) {
#if defined(EIGEN_HIPCC)
        // Cast to a type __shfl_down accepts to avoid ambiguous-overload errors with hipcc.
        if (std::is_floating_point<Type>::value) {
          reducer.reduce(__shfl_down(static_cast<float>(reduced_val), offset), &reduced_val);
        } else {
          reducer.reduce(__shfl_down(static_cast<int>(reduced_val), offset), &reduced_val);
        }
#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
        reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val);
#else
        reducer.reduce(__shfl_down_sync(0xFFFFFFFF, reduced_val, offset), &reduced_val);
#endif
      }

      if ((threadIdx.x & (warpSize - 1)) == 0) {
        atomicReduce(&(output[row]), reduced_val, reducer);
      }
    }
  }
#else   // EIGEN_CUDA_ARCH >= 300
  gpu_assert(0 && "Shouldn't be called on unsupported device");
#endif  // EIGEN_CUDA_ARCH >= 300
}
#ifdef EIGEN_HAS_GPU_FP16
template <int NumPerThread, typename Self, typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(Reducer reducer, const Self input,
                                                                            Index num_coeffs_to_reduce,
                                                                            Index num_preserved_coeffs, half* output) {
  eigen_assert(blockDim.y == 1);
  eigen_assert(blockDim.z == 1);
  eigen_assert(gridDim.y == 1);
  eigen_assert(gridDim.z == 1);

  typedef packet_traits<Eigen::half>::type PacketType;
  const int packet_width = unpacket_traits<PacketType>::size;
  const int unroll_times = 16 / packet_width;
  eigen_assert(NumPerThread % unroll_times == 0);
  eigen_assert(unroll_times % 2 == 0);

  const Index input_col_blocks = numext::div_ceil<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread * 2);
  const Index num_input_blocks = numext::div_ceil<Index>(input_col_blocks * num_preserved_coeffs, 2);

  const Index num_threads = blockDim.x * gridDim.x;
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;

  // Initialize the output values if they weren't initialized by the ReductionInitKernel.
  if (gridDim.x == 1) {
    Index i = packet_width * thread_id;
    for (; i + packet_width <= num_preserved_coeffs; i += packet_width * num_threads) {
      PacketType* poutput = reinterpret_cast<PacketType*>(output + i);
      *poutput = reducer.template initializePacket<PacketType>();
    }
    if (i < num_preserved_coeffs) {
      output[i] = reducer.initialize();
    }
    __syncthreads();
  }

  for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
    const Index row = 2 * (i / input_col_blocks);

    if (row + 1 < num_preserved_coeffs) {
      const Index col_block = i % input_col_blocks;
      const Index col_begin = packet_width * (col_block * blockDim.x * NumPerThread + threadIdx.x);

      PacketType reduced_val1 = reducer.template initializePacket<PacketType>();
      PacketType reduced_val2 = reducer.template initializePacket<PacketType>();

      for (Index j = 0; j < NumPerThread; j += unroll_times) {
        const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1) * packet_width;
        if (last_col >= num_coeffs_to_reduce) {
          Index col = col_begin + blockDim.x * j * packet_width;
          for (; col + packet_width <= num_coeffs_to_reduce; col += blockDim.x) {
            const PacketType val1 = input.m_impl.template packet<Unaligned>(row * num_coeffs_to_reduce + col);
            reducer.reducePacket(val1, &reduced_val1);
            const PacketType val2 = input.m_impl.template packet<Unaligned>((row + 1) * num_coeffs_to_reduce + col);
            reducer.reducePacket(val2, &reduced_val2);
          }
          if (col < num_coeffs_to_reduce) {
            PacketType r1 = reducer.template initializePacket<PacketType>();
            PacketType r2 = reducer.template initializePacket<PacketType>();
            half2* hr1 = reinterpret_cast<half2*>(&r1);
            half2* hr2 = reinterpret_cast<half2*>(&r2);
            while (col + 1 < num_coeffs_to_reduce) {
              *hr1 = __halves2half2(input.m_impl.coeff(row * num_coeffs_to_reduce + col),
                                    input.m_impl.coeff(row * num_coeffs_to_reduce + col + 1));
              *hr2 = __halves2half2(input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col),
                                    input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col + 1));
              hr1++;
              hr2++;
              col += 2;
            }
            if (col < num_coeffs_to_reduce) {
              // Peel off the last lone coefficient of each row.
              const half last1 = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
              *hr1 = __halves2half2(last1, reducer.initialize());
              const half last2 = input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col);
              *hr2 = __halves2half2(last2, reducer.initialize());
            }
            reducer.reducePacket(r1, &reduced_val1);
            reducer.reducePacket(r2, &reduced_val2);
          }
          break;
        } else {
          // Faster version of the loop with no branches after unrolling.
          for (int k = 0; k < unroll_times; ++k) {
            const Index col = col_begin + blockDim.x * (j + k) * packet_width;
            reducer.reducePacket(input.m_impl.template packet<Unaligned>(row * num_coeffs_to_reduce + col),
                                 &reduced_val1);
            reducer.reducePacket(input.m_impl.template packet<Unaligned>((row + 1) * num_coeffs_to_reduce + col),
                                 &reduced_val2);
          }
        }
      }

      for (int offset = warpSize / 2; offset > 0; offset /= 2) {
#if defined(EIGEN_HIPCC)
        PacketType r1;
        PacketType r2;
        half2* hr1 = reinterpret_cast<half2*>(&r1);
        half2* hr2 = reinterpret_cast<half2*>(&r2);
        half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
        half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
        for (int i = 0; i < packet_width / 2; i++) {
          // Workaround for the lack of native half/half2 support in __shfl_down with hipcc.
          union {
            int i;
            half2 h;
          } wka_in1, wka_out1;
          wka_in1.h = rv1[i];
          wka_out1.i = __shfl_down(wka_in1.i, offset, warpSize);
          hr1[i] = wka_out1.h;

          union {
            int i;
            half2 h;
          } wka_in2, wka_out2;
          wka_in2.h = rv2[i];
          wka_out2.i = __shfl_down(wka_in2.i, offset, warpSize);
          hr2[i] = wka_out2.h;
        }
        reducer.reducePacket(r1, &reduced_val1);
        reducer.reducePacket(r2, &reduced_val2);
#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
        PacketType r1;
        PacketType r2;
        half2* hr1 = reinterpret_cast<half2*>(&r1);
        half2* hr2 = reinterpret_cast<half2*>(&r2);
        half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
        half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
        for (int i = 0; i < packet_width / 2; i++) {
          hr1[i] = __shfl_down(rv1[i], offset, warpSize);
          hr2[i] = __shfl_down(rv2[i], offset, warpSize);
        }
        reducer.reducePacket(r1, &reduced_val1);
        reducer.reducePacket(r2, &reduced_val2);
#else
        PacketType r1;
        PacketType r2;
        half2* hr1 = reinterpret_cast<half2*>(&r1);
        half2* hr2 = reinterpret_cast<half2*>(&r2);
        half2* rr1 = reinterpret_cast<half2*>(&reduced_val1);
        half2* rr2 = reinterpret_cast<half2*>(&reduced_val2);
        for (int j = 0; j < packet_width / 2; j++) {
          hr1[j] = __shfl_down_sync(0xFFFFFFFF, rr1[j], (unsigned)offset, warpSize);
          hr2[j] = __shfl_down_sync(0xFFFFFFFF, rr2[j], (unsigned)offset, warpSize);
        }
        reducer.reducePacket(r1, &reduced_val1);
        reducer.reducePacket(r2, &reduced_val2);
#endif
      }

      half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
      half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
      half2 val;
      if (packet_width > 2) {
        reducer.reducePacket(rv1[2], rv1);
        reducer.reducePacket(rv1[3], rv1 + 1);
        reducer.reducePacket(rv1[1], rv1);
        reducer.reducePacket(rv2[2], rv2);
        reducer.reducePacket(rv2[3], rv2 + 1);
        reducer.reducePacket(rv2[1], rv2);
      }
      half val1 = __low2half(*rv1);
      reducer.reduce(__high2half(*rv1), &val1);
      half val2 = __low2half(*rv2);
      reducer.reduce(__high2half(*rv2), &val2);
      val = __halves2half2(val1, val2);
      if ((threadIdx.x & (warpSize - 1)) == 0) {
        half* loc = output + row;
        atomicReduce(reinterpret_cast<half2*>(loc), val, reducer);
      }
    }
  }
}
#endif  // EIGEN_HAS_GPU_FP16
template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
struct InnerReductionLauncher {
  static bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index, typename Self::Index) {
    gpu_assert(false && "Should only be called to reduce doubles, floats and half floats on a gpu device");
    return true;
  }
};
template <typename Self, typename Op, typename OutputType, bool PacketAccess>
struct InnerReductionLauncher<
    Self, Op, OutputType, PacketAccess,
    std::enable_if_t<internal::is_same<float, OutputType>::value || internal::is_same<double, OutputType>::value,
                     void>> {
  static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output,
                  typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
    typedef typename Self::Index Index;

    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
    const int block_size = 256;
    const int num_per_thread = 128;
    const int dyn_blocks = numext::div_ceil<int>(num_coeffs, block_size * num_per_thread);
    const int max_blocks = device.getNumGpuMultiProcessors() * device.maxGpuThreadsPerMultiProcessor() / block_size;
    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);

    if (num_blocks > 1) {
      // We initialize the outputs outside the reduction kernel when we can't be sure that there
      // won't be a race condition between multiple thread blocks.
      const int dyn_blocks2 = numext::div_ceil<int>(num_preserved_vals, 1024);
      const int max_blocks2 = device.getNumGpuMultiProcessors() * device.maxGpuThreadsPerMultiProcessor() / 1024;
      const int num_blocks2 = numext::mini<int>(max_blocks2, dyn_blocks2);
      LAUNCH_GPU_KERNEL((ReductionInitKernel<OutputType, Index>), num_blocks2, 1024, 0, device, reducer.initialize(),
                        num_preserved_vals, output);
    }

    LAUNCH_GPU_KERNEL((InnerReductionKernel<num_per_thread, Self, Op, Index>), num_blocks, block_size, 0, device,
                      reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);

    return false;
  }
};
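// Illustrative sketch (not part of this header): reducing along the innermost
// dimension of a column-major 2-D tensor is the case handled by InnerReducer and the
// InnerReductionKernel above. The function name is hypothetical; it assumes the
// unsupported-API Tensor module and the default ColMajor layout. Not compiled.
#if 0
#include <unsupported/Eigen/CXX11/Tensor>

void inner_sums_on_gpu(const Eigen::GpuDevice& gpu_device,
                       const Eigen::TensorMap<Eigen::Tensor<float, 2>>& in,   // [dim0, dim1]
                       Eigen::TensorMap<Eigen::Tensor<float, 1>>& out) {      // [dim1]
  const Eigen::array<Eigen::Index, 1> reduce_dims = {0};  // dim 0 is innermost for ColMajor
  out.device(gpu_device) = in.sum(reduce_dims);
}
#endif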
#ifdef EIGEN_HAS_GPU_FP16
template <typename Self, typename Op>
struct InnerReductionLauncher<Self, Op, Eigen::half, false> {
  static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) {
    gpu_assert(false && "Should not be called since there is no packet accessor");
    return true;
  }
};

template <typename Self, typename Op>
struct InnerReductionLauncher<Self, Op, Eigen::half, true> {
  static bool run(const Self& self, Op& reducer, const GpuDevice& device, half* output,
                  typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
    typedef typename Self::Index Index;

    if (num_preserved_vals % 2 != 0) {
      // Not supported yet, revert to the slower code path.
      return true;
    }

    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
    const int block_size = 128;
    const int num_per_thread = 64;
    const int dyn_blocks = numext::div_ceil<int>(num_coeffs, block_size * num_per_thread);
    const int max_blocks = device.getNumGpuMultiProcessors() * device.maxGpuThreadsPerMultiProcessor() / block_size;
    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);

    if (num_blocks > 1) {
      // We initialize the outputs outside the reduction kernel when we can't be sure that there
      // won't be a race condition between multiple thread blocks.
      LAUNCH_GPU_KERNEL((ReductionInitKernelHalfFloat<Self, Op, Index>), 1, 1, 0, device, reducer, self,
                        num_preserved_vals, output);
    }

    LAUNCH_GPU_KERNEL((InnerReductionKernelHalfFloat<num_per_thread, Self, Op, Index>), num_blocks, block_size, 0,
                      device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);

    return false;
  }
};
#endif  // EIGEN_HAS_GPU_FP16
template <typename Self, typename Op>
struct InnerReducer<Self, Op, GpuDevice> {
  // Reduce the scope of the optimized version to the simple cases of doubles, floats and half floats.
#ifdef EIGEN_HAS_GPU_FP16
  static constexpr bool HasOptimizedImplementation =
      !Self::ReducerTraits::IsStateful &&
      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
       internal::is_same<typename Self::CoeffReturnType, double>::value ||
       (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value &&
        reducer_traits<Op, GpuDevice>::PacketAccess));
#else   // EIGEN_HAS_GPU_FP16
  static constexpr bool HasOptimizedImplementation =
      !Self::ReducerTraits::IsStateful && (internal::is_same<typename Self::CoeffReturnType, float>::value ||
                                           internal::is_same<typename Self::CoeffReturnType, double>::value);
#endif  // EIGEN_HAS_GPU_FP16

  template <typename OutputType>
  static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output,
                  typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
    gpu_assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
    const Index num_coeffs = array_prod(self.m_impl.dimensions());
    // Don't crash when we're called with an input tensor of size 0.
    if (num_coeffs == 0) {
      return true;
    }
    // It's faster to use the usual code path when the dimension to reduce is small.
    if (num_coeffs_to_reduce <= 128) {
      return true;
    }

    return InnerReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(
        self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);
  }
};
template <int NumPerThread, typename Self, typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(Reducer reducer, const Self input,
                                                                   Index num_coeffs_to_reduce,
                                                                   Index num_preserved_coeffs,
                                                                   typename Self::CoeffReturnType* output) {
  const Index num_threads = blockDim.x * gridDim.x;
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;

  // Initialize the output values if they weren't initialized by the ReductionInitKernel.
  if (gridDim.x == 1) {
    for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
      output[i] = reducer.initialize();
    }
    __syncthreads();
  }

  // Do the reduction.
  const Index max_iter = num_preserved_coeffs * numext::div_ceil<Index>(num_coeffs_to_reduce, NumPerThread);
  for (Index i = thread_id; i < max_iter; i += num_threads) {
    const Index input_col = i % num_preserved_coeffs;
    const Index input_row = (i / num_preserved_coeffs) * NumPerThread;
    typename Self::CoeffReturnType reduced_val = reducer.initialize();
    const Index max_row = numext::mini(input_row + NumPerThread, num_coeffs_to_reduce);
    for (Index j = input_row; j < max_row; j++) {
      typename Self::CoeffReturnType val = input.m_impl.coeff(j * num_preserved_coeffs + input_col);
      reducer.reduce(val, &reduced_val);
    }
    atomicReduce(&(output[input_col]), reduced_val, reducer);
  }
}
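// Illustrative sketch (not part of this header): reducing along the outermost
// dimension of a column-major 2-D tensor is the case handled by OuterReducer via the
// OuterReductionKernel above. The function name is hypothetical; it assumes the
// unsupported-API Tensor module and the default ColMajor layout. Not compiled.
#if 0
#include <unsupported/Eigen/CXX11/Tensor>

void outer_sums_on_gpu(const Eigen::GpuDevice& gpu_device,
                       const Eigen::TensorMap<Eigen::Tensor<float, 2>>& in,   // [dim0, dim1]
                       Eigen::TensorMap<Eigen::Tensor<float, 1>>& out) {      // [dim0]
  const Eigen::array<Eigen::Index, 1> reduce_dims = {1};  // dim 1 is outermost for ColMajor
  out.device(gpu_device) = in.sum(reduce_dims);
}
#endif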
template <typename Self, typename Op>
struct OuterReducer<Self, Op, GpuDevice> {
  // Reduce the scope of the optimized version to the simple cases of doubles and floats.
  static constexpr bool HasOptimizedImplementation =
      !Self::ReducerTraits::IsStateful && (internal::is_same<typename Self::CoeffReturnType, float>::value ||
                                           internal::is_same<typename Self::CoeffReturnType, double>::value);

  template <typename Device, typename OutputType>
  static
#if !defined(EIGEN_HIPCC)
      // FIXME : applying EIGEN_DEVICE_FUNC here under hipcc results in a runtime error
      //         (see the cxx11_tensor_reduction_gpu test), so it is restricted to the non-HIP path.
      EIGEN_DEVICE_FUNC
#endif
      bool
      run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
    gpu_assert(false && "Should only be called to reduce doubles or floats on a gpu device");
    return true;
  }

  static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output,
                  typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
    typedef typename Self::Index Index;

    // It's faster to use the usual code path when the dimension to reduce is small.
    if (num_coeffs_to_reduce <= 32) {
      return true;
    }

    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
    const int block_size = 256;
    const int num_per_thread = 16;
    const int dyn_blocks = numext::div_ceil<int>(num_coeffs, block_size * num_per_thread);
    const int max_blocks = device.getNumGpuMultiProcessors() * device.maxGpuThreadsPerMultiProcessor() / block_size;
    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);

    if (num_blocks > 1) {
      // We initialize the outputs outside the reduction kernel when we can't be sure that there
      // won't be a race condition between multiple thread blocks.
      const int dyn_blocks2 = numext::div_ceil<int>(num_preserved_vals, 1024);
      const int max_blocks2 = device.getNumGpuMultiProcessors() * device.maxGpuThreadsPerMultiProcessor() / 1024;
      const int num_blocks2 = numext::mini<int>(max_blocks2, dyn_blocks2);
      LAUNCH_GPU_KERNEL((ReductionInitKernel<float, Index>), num_blocks2, 1024, 0, device, reducer.initialize(),
                        num_preserved_vals, output);
    }

    LAUNCH_GPU_KERNEL((OuterReductionKernel<num_per_thread, Self, Op, Index>), num_blocks, block_size, 0, device,
                      reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);

    return false;
  }
};

#endif  // defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)

}  // end namespace internal
}  // end namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H