#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H
#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H

// Device side implementation of the cooperative groups feature.

#if !defined(__HIPCC_RTC__)
#include <hip/amd_detail/hip_cooperative_groups_helper.h>
#endif

namespace cooperative_groups {
class thread_group {
 protected:
  uint32_t _type;  // thread_group type
  uint32_t _size;  // total number of threads in the thread_group
  uint64_t _mask;  // lane mask for coalesced and tiled-partitioned group types

  // Construct a generic thread group and record its essential properties. This base
  // constructor is used directly only by this_thread(); all other group types are
  // constructed through their derived classes.
  __CG_QUALIFIER__ thread_group(internal::group_type type, uint32_t size = static_cast<uint64_t>(0),
                                uint64_t mask = static_cast<uint64_t>(0)) {
    _type = type;
    _size = size;
    _mask = mask;
  }

  struct _tiled_info {
    bool is_tiled;
    unsigned int size;
    unsigned int meta_group_rank;
    unsigned int meta_group_size;
  };

  struct _coalesced_info {
    lane_mask member_mask;
    unsigned int size;
    struct _tiled_info tiled_info;
  } coalesced_info;

  friend __CG_QUALIFIER__ thread_group this_thread();
  friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent,
                                                       unsigned int tile_size);
  friend class thread_block;
 public:
  // Total number of threads in the thread group; the size is saved at construction
  // time, so this serves all derived cooperative group types.
  __CG_QUALIFIER__ uint32_t size() const { return _size; }
  __CG_QUALIFIER__ unsigned int cg_type() const { return _type; }
  // Rank of the calling thread within [0, size()).
  __CG_QUALIFIER__ uint32_t thread_rank() const;
  // Is the calling thread part of a valid group of this type?
  __CG_QUALIFIER__ bool is_valid() const;
  // Synchronize the threads of the thread group.
  __CG_QUALIFIER__ void sync() const;
};
// The multi-grid cooperative group type: all threads of all grids of a
// multi-device cooperative kernel launch.
class multi_grid_group : public thread_group {
  // Only this friend function may construct an object of this class.
  friend __CG_QUALIFIER__ multi_grid_group this_multi_grid();

 protected:
  // Construct a multi-grid thread group (through the API this_multi_grid()).
  explicit __CG_QUALIFIER__ multi_grid_group(uint32_t size)
      : thread_group(internal::cg_multi_grid, size) {}

 public:
  // Number of grids (one per participating GPU) in this multi-grid group.
  __CG_QUALIFIER__ uint32_t num_grids() { return internal::multi_grid::num_grids(); }
  // Rank of the calling grid within [0, num_grids()).
  __CG_QUALIFIER__ uint32_t grid_rank() { return internal::multi_grid::grid_rank(); }
  __CG_QUALIFIER__ uint32_t thread_rank() const { return internal::multi_grid::thread_rank(); }
  __CG_QUALIFIER__ bool is_valid() const { return internal::multi_grid::is_valid(); }
  __CG_QUALIFIER__ void sync() const { internal::multi_grid::sync(); }
};
// Construct the multi-grid group the calling kernel belongs to.
__CG_QUALIFIER__ multi_grid_group this_multi_grid() {
  return multi_grid_group(internal::multi_grid::size());
}
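// Usage sketch (illustrative comment, not part of this header): synchronizing every GPU taking
// part in a multi-device cooperative launch. This assumes the kernel was started with
// hipLaunchCooperativeKernelMultiDevice(); otherwise is_valid() reports false and sync() must
// not be called. The kernel and buffer names below are hypothetical.
//
//   __global__ void multi_gpu_phase_kernel(float* slice) {
//     cooperative_groups::multi_grid_group mg = cooperative_groups::this_multi_grid();
//     if (!mg.is_valid()) return;
//     // phase 1: each grid (GPU) works on its own slice ...
//     mg.sync();  // wait for every thread of every participating grid
//     // phase 2: results written by other GPUs are now safe to read ...
//   }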
// The grid cooperative group type: all threads of the current kernel launch.
class grid_group : public thread_group {
  // Only this friend function may construct an object of this class.
  friend __CG_QUALIFIER__ grid_group this_grid();

 protected:
  // Construct a grid thread group (through the API this_grid()).
  explicit __CG_QUALIFIER__ grid_group(uint32_t size) : thread_group(internal::cg_grid, size) {}

 public:
  __CG_QUALIFIER__ uint32_t thread_rank() const { return internal::grid::thread_rank(); }
  __CG_QUALIFIER__ bool is_valid() const { return internal::grid::is_valid(); }
  __CG_QUALIFIER__ void sync() const { internal::grid::sync(); }
  __CG_QUALIFIER__ dim3 group_dim() const { return internal::workgroup::block_dim(); }
};
// Construct the grid group the calling thread belongs to.
__CG_QUALIFIER__ grid_group this_grid() { return grid_group(internal::grid::size()); }
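// Usage sketch (illustrative comment): grid-wide synchronization. A grid_group is only usable
// when the kernel is launched cooperatively (e.g. via hipLaunchCooperativeKernel()); the kernel
// name and buffer below are hypothetical.
//
//   __global__ void two_phase_kernel(float* buf, size_t n) {
//     cooperative_groups::grid_group grid = cooperative_groups::this_grid();
//     size_t tid = grid.thread_rank();
//     if (tid < n) buf[tid] *= 2.0f;  // phase 1
//     grid.sync();                    // every thread of the whole launch reaches this point
//     // phase 2 may now read any element written in phase 1
//   }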
// The workgroup (thread-block) cooperative group type: all threads of the current block.
class thread_block : public thread_group {
  // Only these friend functions may construct an object of this class.
  friend __CG_QUALIFIER__ thread_block this_thread_block();
  friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent,
                                                       unsigned int tile_size);
  friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_block& parent,
                                                       unsigned int tile_size);

 protected:
  explicit __CG_QUALIFIER__ thread_block(uint32_t size)
      : thread_group(internal::cg_workgroup, size) {}

  __CG_QUALIFIER__ thread_group new_tiled_group(unsigned int tile_size) const {
    const bool pow2 = ((tile_size & (tile_size - 1)) == 0);
    // The tile size must be a non-zero power of two no larger than the wavefront size.
    if (!tile_size || (tile_size > __AMDGCN_WAVEFRONT_SIZE) || !pow2) {
      __hip_assert(false && "invalid tile size");
    }

    auto block_size = size();
    auto rank = thread_rank();
    auto partitions = (block_size + tile_size - 1) / tile_size;
    auto tail = (partitions * tile_size) - block_size;
    // The last partition may be smaller when the block size is not a multiple of tile_size.
    auto partition_size = tile_size - tail * (rank >= (partitions - 1) * tile_size);
    thread_group tiledGroup = thread_group(internal::cg_tiled_group, partition_size);

    tiledGroup.coalesced_info.tiled_info.size = tile_size;
    tiledGroup.coalesced_info.tiled_info.is_tiled = true;
    tiledGroup.coalesced_info.tiled_info.meta_group_rank = rank / tile_size;
    tiledGroup.coalesced_info.tiled_info.meta_group_size = partitions;
    return tiledGroup;
  }

 public:
  // 3-dimensional index of the block within the launched grid.
  __CG_STATIC_QUALIFIER__ dim3 group_index() { return internal::workgroup::group_index(); }
  // 3-dimensional index of the thread within the block.
  __CG_STATIC_QUALIFIER__ dim3 thread_index() { return internal::workgroup::thread_index(); }
  __CG_STATIC_QUALIFIER__ uint32_t thread_rank() { return internal::workgroup::thread_rank(); }
  __CG_STATIC_QUALIFIER__ uint32_t size() { return internal::workgroup::size(); }
  __CG_STATIC_QUALIFIER__ bool is_valid() { return internal::workgroup::is_valid(); }
  __CG_STATIC_QUALIFIER__ void sync() { internal::workgroup::sync(); }
  __CG_QUALIFIER__ dim3 group_dim() { return internal::workgroup::block_dim(); }
};
// Construct the thread_block (workgroup) group the calling thread belongs to.
__CG_QUALIFIER__ thread_block this_thread_block() {
  return thread_block(internal::workgroup::size());
}
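// Usage sketch (illustrative comment): thread_block is the cooperative-groups view of the
// current workgroup, so block.sync() has the same effect as __syncthreads(). The kernel below
// is hypothetical and assumes a one-dimensional block of 256 threads.
//
//   __global__ void block_sum(const float* in, float* out) {
//     __shared__ float tile[256];
//     cooperative_groups::thread_block block = cooperative_groups::this_thread_block();
//     unsigned int t = block.thread_rank();
//     tile[t] = in[block.group_index().x * block.size() + t];
//     block.sync();  // all writes to `tile` are now visible to the whole block
//     if (t == 0) {
//       float s = 0.0f;
//       for (unsigned int i = 0; i < block.size(); ++i) s += tile[i];
//       out[block.group_index().x] = s;
//     }
//   }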
// The group type produced by a run-time tiled partition of a thread_block or tiled_group.
class tiled_group : public thread_group {
  // Only these friend functions may construct an object of this class.
  friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent,
                                                       unsigned int tile_size);
  friend __CG_QUALIFIER__ tiled_group tiled_partition(const tiled_group& parent,
                                                      unsigned int tile_size);

  __CG_QUALIFIER__ tiled_group new_tiled_group(unsigned int tile_size) const {
    const bool pow2 = ((tile_size & (tile_size - 1)) == 0);
    // The tile size must be a non-zero power of two no larger than the wavefront size.
    if (!tile_size || (tile_size > __AMDGCN_WAVEFRONT_SIZE) || !pow2) {
      __hip_assert(false && "invalid tile size");
    }

    // Partitioning into tiles at least as large as this group leaves the group unchanged.
    if (size() <= tile_size) {
      return *this;
    }

    tiled_group tiledGroup = tiled_group(tile_size);
    tiledGroup.coalesced_info.tiled_info.is_tiled = true;
    return tiledGroup;
  }

 protected:
  explicit __CG_QUALIFIER__ tiled_group(unsigned int tileSize)
      : thread_group(internal::cg_tiled_group, tileSize) {
    coalesced_info.tiled_info.size = tileSize;
    coalesced_info.tiled_info.is_tiled = true;
  }

 public:
  __CG_QUALIFIER__ unsigned int size() const { return (coalesced_info.tiled_info.size); }

  __CG_QUALIFIER__ unsigned int thread_rank() const {
    return (internal::workgroup::thread_rank() & (coalesced_info.tiled_info.size - 1));
  }

  __CG_QUALIFIER__ void sync() const { internal::tiled_group::sync(); }
};
template <unsigned int size, class ParentCGTy> class thread_block_tile;
// The coalesced group type: the set of lanes that are active at the point where
// coalesced_threads() is called (or that result from a binary or tiled partition).
class coalesced_group : public thread_group {
  // Only these friend functions may construct an object of this class.
  friend __CG_QUALIFIER__ coalesced_group coalesced_threads();
  friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent,
                                                       unsigned int tile_size);
  friend __CG_QUALIFIER__ coalesced_group tiled_partition(const coalesced_group& parent,
                                                          unsigned int tile_size);
  friend __CG_QUALIFIER__ coalesced_group binary_partition(const coalesced_group& cgrp, bool pred);
  template <unsigned int fsize, class fparent>
  friend __CG_QUALIFIER__ coalesced_group
      binary_partition(const thread_block_tile<fsize, fparent>& tgrp, bool pred);
  __CG_QUALIFIER__ coalesced_group new_tiled_group(unsigned int tile_size) const {
    const bool pow2 = ((tile_size & (tile_size - 1)) == 0);

    if (!tile_size || !pow2) {
      return coalesced_group(0);
    }

    // If this coalesced group was itself produced by a tiled partition, its member lanes are
    // contiguous, so the tile mask can be computed directly.
    if (coalesced_info.tiled_info.is_tiled) {
      unsigned int base_offset = (thread_rank() & (~(tile_size - 1)));
      unsigned int masklength = min(static_cast<unsigned int>(size()) - base_offset, tile_size);
      lane_mask member_mask = static_cast<lane_mask>(-1) >> (__AMDGCN_WAVEFRONT_SIZE - masklength);

      member_mask <<= (__lane_id() & ~(tile_size - 1));
      coalesced_group coalesced_tile = coalesced_group(member_mask);
      coalesced_tile.coalesced_info.tiled_info.is_tiled = true;
      coalesced_tile.coalesced_info.tiled_info.meta_group_rank = thread_rank() / tile_size;
      coalesced_tile.coalesced_info.tiled_info.meta_group_size = size() / tile_size;
      return coalesced_tile;
    } else {
      // Otherwise walk the (possibly sparse) member mask and collect the lanes that fall into
      // the calling thread's tile.
      lane_mask member_mask = 0;
      unsigned int tile_rank = 0;
      int lanes_to_skip = ((thread_rank()) / tile_size) * tile_size;

      for (unsigned int i = 0; i < __AMDGCN_WAVEFRONT_SIZE; i++) {
        lane_mask active = coalesced_info.member_mask & (1 << i);
        // Only consider lanes that are members of the parent group.
        if (active) {
          if (lanes_to_skip <= 0 && tile_rank < tile_size) {
            member_mask |= active;
            tile_rank++;
          }
          lanes_to_skip--;
        }
      }
      coalesced_group coalesced_tile = coalesced_group(member_mask);
      coalesced_tile.coalesced_info.tiled_info.meta_group_rank = thread_rank() / tile_size;
      coalesced_tile.coalesced_info.tiled_info.meta_group_size =
          (size() + tile_size - 1) / tile_size;
      return coalesced_tile;
    }
    return coalesced_group(0);
  }
  // Construct a coalesced group from an explicit lane mask.
  explicit __CG_QUALIFIER__ coalesced_group(lane_mask member_mask)
      : thread_group(internal::cg_coalesced_group) {
    coalesced_info.member_mask = member_mask;
    coalesced_info.size = __popcll(coalesced_info.member_mask);
    coalesced_info.tiled_info.is_tiled = false;
    coalesced_info.tiled_info.meta_group_rank = 0;
    coalesced_info.tiled_info.meta_group_size = 1;
  }

 public:
  __CG_QUALIFIER__ unsigned int size() const { return coalesced_info.size; }

  // Rank of the calling lane within the coalesced group, i.e. the number of member lanes
  // with a lower lane id.
  __CG_QUALIFIER__ unsigned int thread_rank() const {
    return internal::coalesced_group::masked_bit_count(coalesced_info.member_mask);
  }

  __CG_QUALIFIER__ void sync() const { internal::coalesced_group::sync(); }

  // Rank of this tile within the group it was partitioned from.
  __CG_QUALIFIER__ unsigned int meta_group_rank() const {
    return coalesced_info.tiled_info.meta_group_rank;
  }

  // Number of tiles the parent group was partitioned into.
  __CG_QUALIFIER__ unsigned int meta_group_size() const {
    return coalesced_info.tiled_info.meta_group_size;
  }
  template <class T> __CG_QUALIFIER__ T shfl(T var, int srcRank) const {
    static_assert(is_valid_type<T>::value, "Neither an integer or float type.");

    srcRank = srcRank % static_cast<int>(size());

    int lane = (size() == __AMDGCN_WAVEFRONT_SIZE) ? srcRank
        : (__AMDGCN_WAVEFRONT_SIZE == 64) ? __fns64(coalesced_info.member_mask, 0, (srcRank + 1))
                                          : __fns32(coalesced_info.member_mask, 0, (srcRank + 1));

    return __shfl(var, lane, __AMDGCN_WAVEFRONT_SIZE);
  }

  template <class T> __CG_QUALIFIER__ T shfl_down(T var, unsigned int lane_delta) const {
    static_assert(is_valid_type<T>::value, "Neither an integer or float type.");

    // Fast path: the group spans the whole wavefront.
    if (size() == __AMDGCN_WAVEFRONT_SIZE) {
      return __shfl_down(var, lane_delta, __AMDGCN_WAVEFRONT_SIZE);
    }

    int lane;
    if (__AMDGCN_WAVEFRONT_SIZE == 64) {
      lane = __fns64(coalesced_info.member_mask, __lane_id(), lane_delta + 1);
    } else {
      lane = __fns32(coalesced_info.member_mask, __lane_id(), lane_delta + 1);
    }

    return __shfl(var, lane, __AMDGCN_WAVEFRONT_SIZE);
  }

  template <class T> __CG_QUALIFIER__ T shfl_up(T var, unsigned int lane_delta) const {
    static_assert(is_valid_type<T>::value, "Neither an integer or float type.");

    // Fast path: the group spans the whole wavefront.
    if (size() == __AMDGCN_WAVEFRONT_SIZE) {
      return __shfl_up(var, lane_delta, __AMDGCN_WAVEFRONT_SIZE);
    }

    int lane;
    if (__AMDGCN_WAVEFRONT_SIZE == 64) {
      lane = __fns64(coalesced_info.member_mask, __lane_id(), -(lane_delta + 1));
    } else if (__AMDGCN_WAVEFRONT_SIZE == 32) {
      lane = __fns32(coalesced_info.member_mask, __lane_id(), -(lane_delta + 1));
    }

    return __shfl(var, lane, __AMDGCN_WAVEFRONT_SIZE);
  }
#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS
  __CG_QUALIFIER__ unsigned long long ballot(int pred) const {
    return internal::helper::adjust_mask(
        coalesced_info.member_mask,
        __ballot_sync<unsigned long long>(coalesced_info.member_mask, pred));
  }

  __CG_QUALIFIER__ int any(int pred) const {
    return __any_sync(static_cast<unsigned long long>(coalesced_info.member_mask), pred);
  }

  __CG_QUALIFIER__ int all(int pred) const {
    return __all_sync(static_cast<unsigned long long>(coalesced_info.member_mask), pred);
  }

  template <typename T> __CG_QUALIFIER__ unsigned long long match_any(T value) const {
    return internal::helper::adjust_mask(
        coalesced_info.member_mask,
        __match_any_sync(static_cast<unsigned long long>(coalesced_info.member_mask), value));
  }

  template <typename T> __CG_QUALIFIER__ unsigned long long match_all(T value, int& pred) const {
    return internal::helper::adjust_mask(
        coalesced_info.member_mask,
        __match_all_sync(static_cast<unsigned long long>(coalesced_info.member_mask), value,
                         &pred));
  }
#endif
};
// Construct a coalesced group containing the currently active lanes of the wavefront.
__CG_QUALIFIER__ coalesced_group coalesced_threads() {
  return cooperative_groups::coalesced_group(__builtin_amdgcn_read_exec());
}
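// Usage sketch (illustrative comment): coalesced_threads() captures exactly the lanes that are
// active at the call site, so it is typically used inside divergent control flow. Below, one
// atomic is issued per coalesced set and the base offset is broadcast from rank 0; the helper
// name is hypothetical.
//
//   __device__ int atomic_agg_inc(int* counter) {
//     cooperative_groups::coalesced_group active = cooperative_groups::coalesced_threads();
//     int base = 0;
//     if (active.thread_rank() == 0) {
//       base = atomicAdd(counter, active.size());
//     }
//     return active.shfl(base, 0) + active.thread_rank();
//   }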
// Implementation of thread_group::thread_rank(), dispatching on the dynamic group type.
__CG_QUALIFIER__ uint32_t thread_group::thread_rank() const {
  switch (this->_type) {
    case internal::cg_multi_grid: {
      return (static_cast<const multi_grid_group*>(this)->thread_rank());
    }
    case internal::cg_grid: {
      return (static_cast<const grid_group*>(this)->thread_rank());
    }
    case internal::cg_workgroup: {
      return (static_cast<const thread_block*>(this)->thread_rank());
    }
    case internal::cg_tiled_group: {
      return (static_cast<const tiled_group*>(this)->thread_rank());
    }
    case internal::cg_coalesced_group: {
      return (static_cast<const coalesced_group*>(this)->thread_rank());
    }
    default: {
      __hip_assert(false && "invalid cooperative group type");
      return -1;
    }
  }
}
// Implementation of thread_group::is_valid(), dispatching on the dynamic group type.
__CG_QUALIFIER__ bool thread_group::is_valid() const {
  switch (this->_type) {
    case internal::cg_multi_grid: {
      return (static_cast<const multi_grid_group*>(this)->is_valid());
    }
    case internal::cg_grid: {
      return (static_cast<const grid_group*>(this)->is_valid());
    }
    case internal::cg_workgroup: {
      return (static_cast<const thread_block*>(this)->is_valid());
    }
    case internal::cg_tiled_group: {
      return (static_cast<const tiled_group*>(this)->is_valid());
    }
    case internal::cg_coalesced_group: {
      return (static_cast<const coalesced_group*>(this)->is_valid());
    }
    default: {
      __hip_assert(false && "invalid cooperative group type");
      return false;
    }
  }
}
// Implementation of thread_group::sync(), dispatching on the dynamic group type.
__CG_QUALIFIER__ void thread_group::sync() const {
  switch (this->_type) {
    case internal::cg_multi_grid: {
      static_cast<const multi_grid_group*>(this)->sync();
      break;
    }
    case internal::cg_grid: {
      static_cast<const grid_group*>(this)->sync();
      break;
    }
    case internal::cg_workgroup: {
      static_cast<const thread_block*>(this)->sync();
      break;
    }
    case internal::cg_tiled_group: {
      static_cast<const tiled_group*>(this)->sync();
      break;
    }
    case internal::cg_coalesced_group: {
      static_cast<const coalesced_group*>(this)->sync();
      break;
    }
    default: {
      __hip_assert(false && "invalid cooperative group type");
    }
  }
}
// Group-type agnostic free functions that forward to the corresponding member functions.
template <class CGTy> __CG_QUALIFIER__ uint32_t group_size(CGTy const& g) { return g.size(); }

template <class CGTy> __CG_QUALIFIER__ uint32_t thread_rank(CGTy const& g) {
  return g.thread_rank();
}

template <class CGTy> __CG_QUALIFIER__ bool is_valid(CGTy const& g) { return g.is_valid(); }

template <class CGTy> __CG_QUALIFIER__ void sync(CGTy const& g) { g.sync(); }
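// Usage sketch (illustrative comment): the free functions above make group-agnostic device code
// possible. The helper below is hypothetical and works for any cooperative group type.
//
//   template <class Group>
//   __device__ void checked_barrier(const Group& g) {
//     if (cooperative_groups::is_valid(g)) {
//       cooperative_groups::sync(g);
//     }
//   }
//
//   // e.g. checked_barrier(cooperative_groups::this_thread_block());
//   //      checked_barrier(cooperative_groups::this_grid());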
template <unsigned int tileSize> class tile_base {
 protected:
  _CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize;

 public:
  // Rank of the calling thread within [0, numThreads).
  _CG_STATIC_CONST_DECL_ unsigned int thread_rank() {
    return (internal::workgroup::thread_rank() & (numThreads - 1));
  }

  // Number of threads in the tile.
  __CG_STATIC_QUALIFIER__ unsigned int size() { return numThreads; }
};
template <unsigned int size> class thread_block_tile_base : public tile_base<size> {
  static_assert(is_valid_tile_size<size>::value,
                "Tile size is either not a power of 2 or greater than the wavefront size");
  using tile_base<size>::numThreads;

  template <unsigned int fsize, class fparent>
  friend __CG_QUALIFIER__ coalesced_group
      binary_partition(const thread_block_tile<fsize, fparent>& tgrp, bool pred);

#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS
  // Mask covering the lanes of the wavefront that belong to this tile.
  __CG_QUALIFIER__ unsigned long long build_mask() const {
    unsigned long long mask = ~0ull >> (64 - numThreads);
    return mask << ((internal::workgroup::thread_rank() / numThreads) * numThreads);
  }
#endif

 public:
  __CG_STATIC_QUALIFIER__ void sync() { internal::tiled_group::sync(); }
  template <class T> __CG_QUALIFIER__ T shfl(T var, int srcRank) const {
    static_assert(is_valid_type<T>::value, "Neither an integer or float type.");
    return (__shfl(var, srcRank, numThreads));
  }

  template <class T> __CG_QUALIFIER__ T shfl_down(T var, unsigned int lane_delta) const {
    static_assert(is_valid_type<T>::value, "Neither an integer or float type.");
    return (__shfl_down(var, lane_delta, numThreads));
  }

  template <class T> __CG_QUALIFIER__ T shfl_up(T var, unsigned int lane_delta) const {
    static_assert(is_valid_type<T>::value, "Neither an integer or float type.");
    return (__shfl_up(var, lane_delta, numThreads));
  }

  template <class T> __CG_QUALIFIER__ T shfl_xor(T var, unsigned int laneMask) const {
    static_assert(is_valid_type<T>::value, "Neither an integer or float type.");
    return (__shfl_xor(var, laneMask, numThreads));
  }
#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS
  __CG_QUALIFIER__ unsigned long long ballot(int pred) const {
    const auto mask = build_mask();
    return internal::helper::adjust_mask(mask, __ballot_sync(mask, pred));
  }

  __CG_QUALIFIER__ int any(int pred) const { return __any_sync(build_mask(), pred); }

  __CG_QUALIFIER__ int all(int pred) const { return __all_sync(build_mask(), pred); }

  template <typename T> __CG_QUALIFIER__ unsigned long long match_any(T value) const {
    const auto mask = build_mask();
    return internal::helper::adjust_mask(mask, __match_any_sync(mask, value));
  }

  template <typename T> __CG_QUALIFIER__ unsigned long long match_all(T value, int& pred) const {
    const auto mask = build_mask();
    return internal::helper::adjust_mask(mask, __match_all_sync(mask, value, &pred));
  }
#endif
};
template <unsigned int tileSize, typename ParentCGTy> class parent_group_info {
 public:
  // Rank of this tile within the parent group.
  __CG_STATIC_QUALIFIER__ unsigned int meta_group_rank() {
    return ParentCGTy::thread_rank() / tileSize;
  }

  // Number of tiles the parent group is partitioned into.
  __CG_STATIC_QUALIFIER__ unsigned int meta_group_size() {
    return (ParentCGTy::size() + tileSize - 1) / tileSize;
  }
};
template <unsigned int tileSize, class ParentCGTy>
class thread_block_tile_type : public thread_block_tile_base<tileSize>,
                               public tiled_group,
                               public parent_group_info<tileSize, ParentCGTy> {
  _CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize;
  typedef thread_block_tile_base<numThreads> tbtBase;

 protected:
  __CG_QUALIFIER__ thread_block_tile_type() : tiled_group(numThreads) {
    coalesced_info.tiled_info.size = numThreads;
    coalesced_info.tiled_info.is_tiled = true;
  }

 public:
  using tbtBase::size;
  using tbtBase::sync;
  using tbtBase::thread_rank;
};
// Partial specialization for the parent-less case: the meta-group information is stored in the
// object instead of being derived from the parent group type.
template <unsigned int tileSize>
class thread_block_tile_type<tileSize, void> : public thread_block_tile_base<tileSize>,
                                               public tiled_group {
  _CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize;

  typedef thread_block_tile_base<numThreads> tbtBase;

 protected:
  __CG_QUALIFIER__ thread_block_tile_type(unsigned int meta_group_rank,
                                          unsigned int meta_group_size)
      : tiled_group(numThreads) {
    coalesced_info.tiled_info.size = numThreads;
    coalesced_info.tiled_info.is_tiled = true;
    coalesced_info.tiled_info.meta_group_rank = meta_group_rank;
    coalesced_info.tiled_info.meta_group_size = meta_group_size;
  }

 public:
  using tbtBase::size;
  using tbtBase::sync;
  using tbtBase::thread_rank;

  __CG_QUALIFIER__ unsigned int meta_group_rank() const {
    return coalesced_info.tiled_info.meta_group_rank;
  }

  __CG_QUALIFIER__ unsigned int meta_group_size() const {
    return coalesced_info.tiled_info.meta_group_size;
  }
};
// Construct a group containing only the calling thread.
__CG_QUALIFIER__ thread_group this_thread() {
  thread_group g(internal::group_type::cg_coalesced_group, 1, __ockl_activelane_u32());
  return g;
}
// Run-time tiled partition of a generic thread_group; dispatches on the dynamic group type.
__CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent, unsigned int tile_size) {
  if (parent.cg_type() == internal::cg_tiled_group) {
    const tiled_group* cg = static_cast<const tiled_group*>(&parent);
    return cg->new_tiled_group(tile_size);
  } else if (parent.cg_type() == internal::cg_coalesced_group) {
    const coalesced_group* cg = static_cast<const coalesced_group*>(&parent);
    return cg->new_tiled_group(tile_size);
  } else {
    const thread_block* tb = static_cast<const thread_block*>(&parent);
    return tb->new_tiled_group(tile_size);
  }
}

__CG_QUALIFIER__ thread_group tiled_partition(const thread_block& parent, unsigned int tile_size) {
  return (parent.new_tiled_group(tile_size));
}

__CG_QUALIFIER__ tiled_group tiled_partition(const tiled_group& parent, unsigned int tile_size) {
  return (parent.new_tiled_group(tile_size));
}

__CG_QUALIFIER__ coalesced_group tiled_partition(const coalesced_group& parent,
                                                 unsigned int tile_size) {
  return (parent.new_tiled_group(tile_size));
}
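// Usage sketch (illustrative comment): run-time tiled_partition() splits a parent group into
// tiles whose size is only known at run time (a power of two no larger than the wavefront
// size). The result is used through the generic thread_group interface; the kernel below is
// hypothetical.
//
//   __global__ void tiled_kernel(unsigned int* out) {
//     cooperative_groups::thread_block block = cooperative_groups::this_thread_block();
//     cooperative_groups::thread_group tile16 = cooperative_groups::tiled_partition(block, 16);
//     out[block.thread_rank()] = tile16.thread_rank();  // 0..15, repeating across the block
//   }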
namespace impl {
template <unsigned int size, class ParentCGTy> class thread_block_tile_internal;

template <unsigned int size, class ParentCGTy>
class thread_block_tile_internal : public thread_block_tile_type<size, ParentCGTy> {
 protected:
  template <unsigned int tbtSize, class tbtParentT>
  __CG_QUALIFIER__ thread_block_tile_internal(
      const thread_block_tile_internal<tbtSize, tbtParentT>& g)
      : thread_block_tile_type<size, ParentCGTy>(g.meta_group_rank(), g.meta_group_size()) {}

  __CG_QUALIFIER__ thread_block_tile_internal(const thread_block& g)
      : thread_block_tile_type<size, ParentCGTy>() {}
};
}  // namespace impl
template <unsigned int size, class ParentCGTy>
class thread_block_tile : public impl::thread_block_tile_internal<size, ParentCGTy> {
 protected:
  __CG_QUALIFIER__ thread_block_tile(const ParentCGTy& g)
      : impl::thread_block_tile_internal<size, ParentCGTy>(g) {}

 public:
  // Allow conversion to the parent-less tile type.
  __CG_QUALIFIER__ operator thread_block_tile<size, void>() const {
    return thread_block_tile<size, void>(*this);
  }
};

template <unsigned int size>
class thread_block_tile<size, void> : public impl::thread_block_tile_internal<size, void> {
  template <unsigned int, class ParentCGTy> friend class thread_block_tile;

 public:
  template <class ParentCGTy>
  __CG_QUALIFIER__ thread_block_tile(const thread_block_tile<size, ParentCGTy>& g)
      : impl::thread_block_tile_internal<size, void>(g) {}
};
template <unsigned int size, class ParentCGTy = void> class thread_block_tile;

namespace impl {
template <unsigned int size, class ParentCGTy> struct tiled_partition_internal;

template <unsigned int size>
struct tiled_partition_internal<size, thread_block> : public thread_block_tile<size, thread_block> {
  __CG_QUALIFIER__ tiled_partition_internal(const thread_block& g)
      : thread_block_tile<size, thread_block>(g) {}
};
}  // namespace impl
// Compile-time tiled partition of a parent group into thread_block_tile<size> groups.
template <unsigned int size, class ParentCGTy>
__CG_QUALIFIER__ thread_block_tile<size, ParentCGTy> tiled_partition(const ParentCGTy& g) {
  static_assert(is_valid_tile_size<size>::value,
                "Tiled partition with size > wavefront size. Currently not supported ");
  return impl::tiled_partition_internal<size, ParentCGTy>(g);
}
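// Usage sketch (illustrative comment): the compile-time tiled_partition<N>() yields a
// thread_block_tile<N> with warp-level collectives (shfl, shfl_down, ...). A common use is a
// tile-wide reduction; the helper is hypothetical and assumes N = 32 does not exceed the
// wavefront size.
//
//   __device__ float tile_reduce_sum(float v) {
//     cooperative_groups::thread_block block = cooperative_groups::this_thread_block();
//     auto tile = cooperative_groups::tiled_partition<32>(block);
//     for (unsigned int offset = tile.size() / 2; offset > 0; offset /= 2) {
//       v += tile.shfl_down(v, offset);
//     }
//     return v;  // rank 0 of each tile holds that tile's sum
//   }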
#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS
// Binary partition of a coalesced group: threads that evaluate the predicate the same way end
// up in the same resulting coalesced group.
__CG_QUALIFIER__ coalesced_group binary_partition(const coalesced_group& cgrp, bool pred) {
  auto mask = __ballot_sync<unsigned long long>(cgrp.coalesced_info.member_mask, pred);

  if (pred) {
    return coalesced_group(mask);
  } else {
    return coalesced_group(cgrp.coalesced_info.member_mask ^ mask);
  }
}
template <unsigned int size, class parent>
__CG_QUALIFIER__ coalesced_group binary_partition(const thread_block_tile<size, parent>& tgrp,
                                                  bool pred) {
  auto mask = __ballot_sync<unsigned long long>(tgrp.build_mask(), pred);

  if (pred) {
    return coalesced_group(mask);
  } else {
    return coalesced_group(tgrp.build_mask() ^ mask);
  }
}
#endif  // HIP_ENABLE_WARP_SYNC_BUILTINS
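// Usage sketch (illustrative comment, requires HIP_ENABLE_WARP_SYNC_BUILTINS): binary_partition()
// splits the calling group into two coalesced groups, one per predicate value. The function and
// counter names below are hypothetical.
//
//   __device__ void count_even_odd(int value, int* even_count, int* odd_count) {
//     cooperative_groups::coalesced_group active = cooperative_groups::coalesced_threads();
//     bool is_even = (value % 2) == 0;
//     cooperative_groups::coalesced_group side =
//         cooperative_groups::binary_partition(active, is_even);
//     if (side.thread_rank() == 0) {
//       atomicAdd(is_even ? even_count : odd_count, static_cast<int>(side.size()));
//     }
//   }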
}  // namespace cooperative_groups

#endif  // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H