HIP: Heterogeneous-computing Interface for Portability
amd_hip_cooperative_groups.h
/*
Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H
#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H

#if __cplusplus
#if !defined(__HIPCC_RTC__)
#endif

namespace cooperative_groups {

class thread_group {
 protected:
  uint32_t _type;  // thread_group type
  uint32_t _size;  // total number of threads in the thread_group
  uint64_t _mask;  // Lanemask for coalesced and tiled partitioned group types,
                   // LSB represents lane 0, and MSB represents lane 63

  // Construct a thread group, and set thread group type and other essential
  // thread group properties. This generic thread group is directly constructed
  // only when the group is supposed to contain only the calling thread
  // (through the API `this_thread()`); in all other cases, this thread
  // group object is a sub-object of some other derived thread group object
  __CG_QUALIFIER__ thread_group(internal::group_type type, uint32_t size = static_cast<uint32_t>(0),
                                uint64_t mask = static_cast<uint64_t>(0)) {
    _type = type;
    _size = size;
    _mask = mask;
  }

  struct _tiled_info {
    bool is_tiled;
    unsigned int size;
    unsigned int meta_group_rank;
    unsigned int meta_group_size;
  };

  struct _coalesced_info {
    lane_mask member_mask;
    unsigned int size;
    struct _tiled_info tiled_info;
  } coalesced_info;

  friend __CG_QUALIFIER__ thread_group this_thread();
  friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent,
                                                       unsigned int tile_size);
  friend class thread_block;

 public:
  // Total number of threads in the thread group; this serves the purpose
  // for all derived cooperative group types since their `size` is directly
  // saved during the construction
  __CG_QUALIFIER__ uint32_t size() const { return _size; }
  __CG_QUALIFIER__ unsigned int cg_type() const { return _type; }
  // Rank of the calling thread within [0, size())
  __CG_QUALIFIER__ uint32_t thread_rank() const;
  // Is this cooperative group type valid?
  __CG_QUALIFIER__ bool is_valid() const;
  // Synchronize the threads in the thread group
  __CG_QUALIFIER__ void sync() const;
};

class multi_grid_group : public thread_group {
  // Only these friend functions are allowed to construct an object of this class
  // and access its resources
  friend __CG_QUALIFIER__ multi_grid_group this_multi_grid();

 protected:
  // Construct a multi-grid thread group (through the API this_multi_grid())
  explicit __CG_QUALIFIER__ multi_grid_group(uint32_t size)
      : thread_group(internal::cg_multi_grid, size) {}

 public:
  // Number of invocations participating in this multi-grid group. In other
  // words, the number of GPUs
  __CG_QUALIFIER__ uint32_t num_grids() { return internal::multi_grid::num_grids(); }
  // Rank of this invocation. In other words, an ID number within the range
  // [0, num_grids()) of the GPU this kernel is running on
  __CG_QUALIFIER__ uint32_t grid_rank() { return internal::multi_grid::grid_rank(); }
  __CG_QUALIFIER__ uint32_t thread_rank() const { return internal::multi_grid::thread_rank(); }
  __CG_QUALIFIER__ bool is_valid() const { return internal::multi_grid::is_valid(); }
  __CG_QUALIFIER__ void sync() const { internal::multi_grid::sync(); }
};

__CG_QUALIFIER__ multi_grid_group this_multi_grid() {
  return multi_grid_group(internal::multi_grid::size());
}
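
/* Usage sketch (illustrative, not part of this header): device-side multi-grid
   synchronization. The kernel and buffer below are hypothetical; a multi-grid
   group is only formed when the kernel is launched on several devices with
   hipLaunchCooperativeKernelMultiDevice.

__global__ void multi_gpu_phase_kernel(float* local_buf) {
  namespace cg = cooperative_groups;
  cg::multi_grid_group mgrid = cg::this_multi_grid();
  local_buf[cg::this_grid().thread_rank()] = static_cast<float>(mgrid.grid_rank());
  mgrid.sync();  // all participating GPUs reach this point before any proceeds
}
*/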

class grid_group : public thread_group {
  // Only these friend functions are allowed to construct an object of this class
  // and access its resources
  friend __CG_QUALIFIER__ grid_group this_grid();

 protected:
  // Construct grid thread group (through the API this_grid())
  explicit __CG_QUALIFIER__ grid_group(uint32_t size) : thread_group(internal::cg_grid, size) {}

 public:
  __CG_QUALIFIER__ uint32_t thread_rank() const { return internal::grid::thread_rank(); }
  __CG_QUALIFIER__ bool is_valid() const { return internal::grid::is_valid(); }
  __CG_QUALIFIER__ void sync() const { internal::grid::sync(); }
  __CG_QUALIFIER__ dim3 group_dim() const { return internal::workgroup::block_dim(); }
};

__CG_QUALIFIER__ grid_group this_grid() { return grid_group(internal::grid::size()); }
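
/* Usage sketch (illustrative, not part of this header): grid-wide barrier.
   The kernel and buffer are hypothetical; grid.sync() is only valid when the
   kernel is launched cooperatively (e.g. via hipLaunchCooperativeKernel).

__global__ void grid_sync_kernel(float* buf) {
  namespace cg = cooperative_groups;
  cg::grid_group grid = cg::this_grid();
  buf[grid.thread_rank()] = 1.0f;  // phase 1: every thread in the grid writes
  grid.sync();                     // grid-wide barrier across all blocks
  // phase 2: any thread may now safely read values written in phase 1
}
*/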

class thread_block : public thread_group {
  // Only these friend functions are allowed to construct an object of this
  // class and access its resources
  friend __CG_QUALIFIER__ thread_block this_thread_block();
  friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent,
                                                       unsigned int tile_size);
  friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_block& parent,
                                                       unsigned int tile_size);

 protected:
  // Construct a workgroup thread group (through the API this_thread_block())
  explicit __CG_QUALIFIER__ thread_block(uint32_t size)
      : thread_group(internal::cg_workgroup, size) {}

  __CG_QUALIFIER__ thread_group new_tiled_group(unsigned int tile_size) const {
    const bool pow2 = ((tile_size & (tile_size - 1)) == 0);
    // Invalid tile size, assert
    if (!tile_size || (tile_size > __AMDGCN_WAVEFRONT_SIZE) || !pow2) {
      __hip_assert(false && "invalid tile size");
    }

    auto block_size = size();
    auto rank = thread_rank();
    auto partitions = (block_size + tile_size - 1) / tile_size;
    auto tail = (partitions * tile_size) - block_size;
    auto partition_size = tile_size - tail * (rank >= (partitions - 1) * tile_size);
    thread_group tiledGroup = thread_group(internal::cg_tiled_group, partition_size);

    tiledGroup.coalesced_info.tiled_info.size = tile_size;
    tiledGroup.coalesced_info.tiled_info.is_tiled = true;
    tiledGroup.coalesced_info.tiled_info.meta_group_rank = rank / tile_size;
    tiledGroup.coalesced_info.tiled_info.meta_group_size = partitions;
    return tiledGroup;
  }

 public:
  // 3-dimensional block index within the grid
  __CG_STATIC_QUALIFIER__ dim3 group_index() { return internal::workgroup::group_index(); }
  // 3-dimensional thread index within the block
  __CG_STATIC_QUALIFIER__ dim3 thread_index() { return internal::workgroup::thread_index(); }
  __CG_STATIC_QUALIFIER__ uint32_t thread_rank() { return internal::workgroup::thread_rank(); }
  __CG_STATIC_QUALIFIER__ uint32_t size() { return internal::workgroup::size(); }
  __CG_STATIC_QUALIFIER__ bool is_valid() { return internal::workgroup::is_valid(); }
  __CG_STATIC_QUALIFIER__ void sync() { internal::workgroup::sync(); }
  __CG_QUALIFIER__ dim3 group_dim() { return internal::workgroup::block_dim(); }
};

__CG_QUALIFIER__ thread_block this_thread_block() {
  return thread_block(internal::workgroup::size());
}
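
/* Usage sketch (illustrative, not part of this header): basic thread_block use.
   The kernel name, arrays, and 256-thread launch configuration are assumptions.

__global__ void block_reverse_kernel(const int* in, int* out) {
  namespace cg = cooperative_groups;
  cg::thread_block block = cg::this_thread_block();
  __shared__ int scratch[256];
  unsigned int gid = block.group_index().x * block.size() + block.thread_rank();
  scratch[block.thread_rank()] = in[gid];
  block.sync();  // workgroup-wide barrier, equivalent to __syncthreads()
  out[gid] = scratch[block.size() - 1 - block.thread_rank()];
}
*/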

class tiled_group : public thread_group {
 private:
  friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent,
                                                       unsigned int tile_size);
  friend __CG_QUALIFIER__ tiled_group tiled_partition(const tiled_group& parent,
                                                      unsigned int tile_size);

  __CG_QUALIFIER__ tiled_group new_tiled_group(unsigned int tile_size) const {
    const bool pow2 = ((tile_size & (tile_size - 1)) == 0);

    if (!tile_size || (tile_size > __AMDGCN_WAVEFRONT_SIZE) || !pow2) {
      __hip_assert(false && "invalid tile size");
    }

    if (size() <= tile_size) {
      return *this;
    }

    tiled_group tiledGroup = tiled_group(tile_size);
    tiledGroup.coalesced_info.tiled_info.is_tiled = true;
    return tiledGroup;
  }

 protected:
  explicit __CG_QUALIFIER__ tiled_group(unsigned int tileSize)
      : thread_group(internal::cg_tiled_group, tileSize) {
    coalesced_info.tiled_info.size = tileSize;
    coalesced_info.tiled_info.is_tiled = true;
  }

 public:
  __CG_QUALIFIER__ unsigned int size() const { return (coalesced_info.tiled_info.size); }

  __CG_QUALIFIER__ unsigned int thread_rank() const {
    return (internal::workgroup::thread_rank() & (coalesced_info.tiled_info.size - 1));
  }

  __CG_QUALIFIER__ void sync() const {
    internal::tiled_group::sync();
  }
};

template <unsigned int size, class ParentCGTy> class thread_block_tile;

class coalesced_group : public thread_group {
 private:
  friend __CG_QUALIFIER__ coalesced_group coalesced_threads();
  friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent, unsigned int tile_size);
  friend __CG_QUALIFIER__ coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tile_size);
  friend __CG_QUALIFIER__ coalesced_group binary_partition(const coalesced_group& cgrp, bool pred);
  template <unsigned int fsize, class fparent>
  friend __CG_QUALIFIER__ coalesced_group
  binary_partition(const thread_block_tile<fsize, fparent>& tgrp, bool pred);

  __CG_QUALIFIER__ coalesced_group new_tiled_group(unsigned int tile_size) const {
    const bool pow2 = ((tile_size & (tile_size - 1)) == 0);

    if (!tile_size || !pow2) {
      return coalesced_group(0);
    }

    // If a tiled group is passed to be partitioned further into a coalesced_group,
    // prepare a mask for further partitioning it so that it stays coalesced.
    if (coalesced_info.tiled_info.is_tiled) {
      unsigned int base_offset = (thread_rank() & (~(tile_size - 1)));
      unsigned int masklength = min(static_cast<unsigned int>(size()) - base_offset, tile_size);
      lane_mask member_mask = static_cast<lane_mask>(-1) >> (__AMDGCN_WAVEFRONT_SIZE - masklength);

      member_mask <<= (__lane_id() & ~(tile_size - 1));
      coalesced_group coalesced_tile = coalesced_group(member_mask);
      coalesced_tile.coalesced_info.tiled_info.is_tiled = true;
      coalesced_tile.coalesced_info.tiled_info.meta_group_rank = thread_rank() / tile_size;
      coalesced_tile.coalesced_info.tiled_info.meta_group_size = size() / tile_size;
      return coalesced_tile;
    }
    // Here the parent coalesced_group is not partitioned.
    else {
      lane_mask member_mask = 0;
      unsigned int tile_rank = 0;
      int lanes_to_skip = ((thread_rank()) / tile_size) * tile_size;

      for (unsigned int i = 0; i < __AMDGCN_WAVEFRONT_SIZE; i++) {
        lane_mask active = coalesced_info.member_mask & (static_cast<lane_mask>(1) << i);
        // Make sure the lane is active
        if (active) {
          if (lanes_to_skip <= 0 && tile_rank < tile_size) {
            // Prepare a member_mask that is appropriate for a tile
            member_mask |= active;
            tile_rank++;
          }
          lanes_to_skip--;
        }
      }
      coalesced_group coalesced_tile = coalesced_group(member_mask);
      coalesced_tile.coalesced_info.tiled_info.meta_group_rank = thread_rank() / tile_size;
      coalesced_tile.coalesced_info.tiled_info.meta_group_size =
          (size() + tile_size - 1) / tile_size;
      return coalesced_tile;
    }
    return coalesced_group(0);
  }

 protected:
  // Constructor
  explicit __CG_QUALIFIER__ coalesced_group(lane_mask member_mask)
      : thread_group(internal::cg_coalesced_group) {
    coalesced_info.member_mask = member_mask;                    // Which threads are active
    coalesced_info.size = __popcll(coalesced_info.member_mask);  // How many threads are active
    coalesced_info.tiled_info.is_tiled = false;                  // Not a partitioned group
    coalesced_info.tiled_info.meta_group_rank = 0;
    coalesced_info.tiled_info.meta_group_size = 1;
  }

 public:
  __CG_QUALIFIER__ unsigned int size() const {
    return coalesced_info.size;
  }

  __CG_QUALIFIER__ unsigned int thread_rank() const {
    return internal::coalesced_group::masked_bit_count(coalesced_info.member_mask);
  }

  __CG_QUALIFIER__ void sync() const {
    internal::coalesced_group::sync();
  }

  __CG_QUALIFIER__ unsigned int meta_group_rank() const {
    return coalesced_info.tiled_info.meta_group_rank;
  }

  __CG_QUALIFIER__ unsigned int meta_group_size() const {
    return coalesced_info.tiled_info.meta_group_size;
  }

  template <class T>
  __CG_QUALIFIER__ T shfl(T var, int srcRank) const {
    static_assert(is_valid_type<T>::value, "Neither an integer or float type.");

    srcRank = srcRank % static_cast<int>(size());

    int lane = (size() == __AMDGCN_WAVEFRONT_SIZE) ? srcRank
        : (__AMDGCN_WAVEFRONT_SIZE == 64) ? __fns64(coalesced_info.member_mask, 0, (srcRank + 1))
                                          : __fns32(coalesced_info.member_mask, 0, (srcRank + 1));

    return __shfl(var, lane, __AMDGCN_WAVEFRONT_SIZE);
  }

  template <class T>
  __CG_QUALIFIER__ T shfl_down(T var, unsigned int lane_delta) const {
    static_assert(is_valid_type<T>::value, "Neither an integer or float type.");

    // Note: The CUDA implementation appears to use the remainder of lane_delta
    // and WARP_SIZE as the shift value rather than lane_delta itself.
    // This is not described in the documentation and is not done here.

    if (size() == __AMDGCN_WAVEFRONT_SIZE) {
      return __shfl_down(var, lane_delta, __AMDGCN_WAVEFRONT_SIZE);
    }

    int lane;
    if (__AMDGCN_WAVEFRONT_SIZE == 64) {
      lane = __fns64(coalesced_info.member_mask, __lane_id(), lane_delta + 1);
    } else {
      lane = __fns32(coalesced_info.member_mask, __lane_id(), lane_delta + 1);
    }

    if (lane == -1) {
      lane = __lane_id();
    }

    return __shfl(var, lane, __AMDGCN_WAVEFRONT_SIZE);
  }

  template <class T>
  __CG_QUALIFIER__ T shfl_up(T var, unsigned int lane_delta) const {
    static_assert(is_valid_type<T>::value, "Neither an integer or float type.");

    // Note: The CUDA implementation appears to use the remainder of lane_delta
    // and WARP_SIZE as the shift value rather than lane_delta itself.
    // This is not described in the documentation and is not done here.

    if (size() == __AMDGCN_WAVEFRONT_SIZE) {
      return __shfl_up(var, lane_delta, __AMDGCN_WAVEFRONT_SIZE);
    }

    int lane;
    if (__AMDGCN_WAVEFRONT_SIZE == 64) {
      lane = __fns64(coalesced_info.member_mask, __lane_id(), -(lane_delta + 1));
    } else if (__AMDGCN_WAVEFRONT_SIZE == 32) {
      lane = __fns32(coalesced_info.member_mask, __lane_id(), -(lane_delta + 1));
    }

    if (lane == -1) {
      lane = __lane_id();
    }

    return __shfl(var, lane, __AMDGCN_WAVEFRONT_SIZE);
  }

#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS
  __CG_QUALIFIER__ unsigned long long ballot(int pred) const {
    return internal::helper::adjust_mask(
        coalesced_info.member_mask,
        __ballot_sync<unsigned long long>(coalesced_info.member_mask, pred));
  }

  __CG_QUALIFIER__ int any(int pred) const {
    return __any_sync(static_cast<unsigned long long>(coalesced_info.member_mask), pred);
  }

  __CG_QUALIFIER__ int all(int pred) const {
    return __all_sync(static_cast<unsigned long long>(coalesced_info.member_mask), pred);
  }

  template <typename T> __CG_QUALIFIER__ unsigned long long match_any(T value) const {
    return internal::helper::adjust_mask(
        coalesced_info.member_mask,
        __match_any_sync(static_cast<unsigned long long>(coalesced_info.member_mask), value));
  }

  template <typename T> __CG_QUALIFIER__ unsigned long long match_all(T value, int& pred) const {
    return internal::helper::adjust_mask(
        coalesced_info.member_mask,
        __match_all_sync(static_cast<unsigned long long>(coalesced_info.member_mask), value,
                         &pred));
  }
#endif
};

__CG_QUALIFIER__ coalesced_group coalesced_threads() {
  return cooperative_groups::coalesced_group(__builtin_amdgcn_read_exec());
}
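
/* Usage sketch (illustrative, not part of this header): cooperating among the
   lanes that happen to be active inside a divergent branch. The kernel, the
   filtering condition, and the compaction pattern are assumptions.

__global__ void coalesced_filter_kernel(const int* in, int* out, int* count) {
  namespace cg = cooperative_groups;
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (in[tid] > 0) {                                          // divergent branch
    cg::coalesced_group active = cg::coalesced_threads();
    int base = 0;
    if (active.thread_rank() == 0) {
      base = atomicAdd(count, static_cast<int>(active.size()));  // one reservation per group
    }
    base = active.shfl(base, 0);                              // broadcast from rank 0
    out[base + active.thread_rank()] = in[tid];               // compacted write
  }
}
*/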

__CG_QUALIFIER__ uint32_t thread_group::thread_rank() const {
  switch (this->_type) {
    case internal::cg_multi_grid: {
      return (static_cast<const multi_grid_group*>(this)->thread_rank());
    }
    case internal::cg_grid: {
      return (static_cast<const grid_group*>(this)->thread_rank());
    }
    case internal::cg_workgroup: {
      return (static_cast<const thread_block*>(this)->thread_rank());
    }
    case internal::cg_tiled_group: {
      return (static_cast<const tiled_group*>(this)->thread_rank());
    }
    case internal::cg_coalesced_group: {
      return (static_cast<const coalesced_group*>(this)->thread_rank());
    }
    default: {
      __hip_assert(false && "invalid cooperative group type");
      return -1;
    }
  }
}

__CG_QUALIFIER__ bool thread_group::is_valid() const {
  switch (this->_type) {
    case internal::cg_multi_grid: {
      return (static_cast<const multi_grid_group*>(this)->is_valid());
    }
    case internal::cg_grid: {
      return (static_cast<const grid_group*>(this)->is_valid());
    }
    case internal::cg_workgroup: {
      return (static_cast<const thread_block*>(this)->is_valid());
    }
    case internal::cg_tiled_group: {
      return (static_cast<const tiled_group*>(this)->is_valid());
    }
    case internal::cg_coalesced_group: {
      return (static_cast<const coalesced_group*>(this)->is_valid());
    }
    default: {
      __hip_assert(false && "invalid cooperative group type");
      return false;
    }
  }
}

__CG_QUALIFIER__ void thread_group::sync() const {
  switch (this->_type) {
    case internal::cg_multi_grid: {
      static_cast<const multi_grid_group*>(this)->sync();
      break;
    }
    case internal::cg_grid: {
      static_cast<const grid_group*>(this)->sync();
      break;
    }
    case internal::cg_workgroup: {
      static_cast<const thread_block*>(this)->sync();
      break;
    }
    case internal::cg_tiled_group: {
      static_cast<const tiled_group*>(this)->sync();
      break;
    }
    case internal::cg_coalesced_group: {
      static_cast<const coalesced_group*>(this)->sync();
      break;
    }
    default: {
      __hip_assert(false && "invalid cooperative group type");
    }
  }
}

template <class CGTy> __CG_QUALIFIER__ uint32_t group_size(CGTy const& g) { return g.size(); }

template <class CGTy> __CG_QUALIFIER__ uint32_t thread_rank(CGTy const& g) {
  return g.thread_rank();
}

template <class CGTy> __CG_QUALIFIER__ bool is_valid(CGTy const& g) { return g.is_valid(); }

template <class CGTy> __CG_QUALIFIER__ void sync(CGTy const& g) { g.sync(); }
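
/* Usage sketch (illustrative, not part of this header): the free functions
   group_size(), thread_rank(), is_valid() and sync() let a device helper be
   written once for any cooperative group type. The helper below is hypothetical.

template <class Group>
__device__ unsigned int mirrored_rank(const Group& g) {
  cooperative_groups::sync(g);                  // dispatches to g.sync()
  return cooperative_groups::group_size(g) - 1  // works for blocks, tiles,
         - cooperative_groups::thread_rank(g);  // coalesced groups, etc.
}
*/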

template <unsigned int tileSize> class tile_base {
 protected:
  _CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize;

 public:
  // Rank of the thread within this tile
  _CG_STATIC_CONST_DECL_ unsigned int thread_rank() {
    return (internal::workgroup::thread_rank() & (numThreads - 1));
  }

  // Number of threads within this tile
  __CG_STATIC_QUALIFIER__ unsigned int size() { return numThreads; }
};

template <unsigned int size> class thread_block_tile_base : public tile_base<size> {
  static_assert(is_valid_tile_size<size>::value,
                "Tile size is either not a power of 2 or greater than the wavefront size");
  using tile_base<size>::numThreads;

  template <unsigned int fsize, class fparent>
  friend __CG_QUALIFIER__ coalesced_group
  binary_partition(const thread_block_tile<fsize, fparent>& tgrp, bool pred);

#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS
  __CG_QUALIFIER__ unsigned long long build_mask() const {
    unsigned long long mask = ~0ull >> (64 - numThreads);
    return mask << ((internal::workgroup::thread_rank() / numThreads) * numThreads);
  }
#endif

 public:
  __CG_STATIC_QUALIFIER__ void sync() {
    internal::tiled_group::sync();
  }

  template <class T> __CG_QUALIFIER__ T shfl(T var, int srcRank) const {
    static_assert(is_valid_type<T>::value, "Neither an integer or float type.");
    return (__shfl(var, srcRank, numThreads));
  }

  template <class T> __CG_QUALIFIER__ T shfl_down(T var, unsigned int lane_delta) const {
    static_assert(is_valid_type<T>::value, "Neither an integer or float type.");
    return (__shfl_down(var, lane_delta, numThreads));
  }

  template <class T> __CG_QUALIFIER__ T shfl_up(T var, unsigned int lane_delta) const {
    static_assert(is_valid_type<T>::value, "Neither an integer or float type.");
    return (__shfl_up(var, lane_delta, numThreads));
  }

  template <class T> __CG_QUALIFIER__ T shfl_xor(T var, unsigned int laneMask) const {
    static_assert(is_valid_type<T>::value, "Neither an integer or float type.");
    return (__shfl_xor(var, laneMask, numThreads));
  }

#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS
  __CG_QUALIFIER__ unsigned long long ballot(int pred) const {
    const auto mask = build_mask();
    return internal::helper::adjust_mask(mask, __ballot_sync(mask, pred));
  }

  __CG_QUALIFIER__ int any(int pred) const { return __any_sync(build_mask(), pred); }

  __CG_QUALIFIER__ int all(int pred) const { return __all_sync(build_mask(), pred); }

  template <typename T> __CG_QUALIFIER__ unsigned long long match_any(T value) const {
    const auto mask = build_mask();
    return internal::helper::adjust_mask(mask, __match_any_sync(mask, value));
  }

  template <typename T> __CG_QUALIFIER__ unsigned long long match_all(T value, int& pred) const {
    const auto mask = build_mask();
    return internal::helper::adjust_mask(mask, __match_all_sync(mask, value, &pred));
  }
#endif
};
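
/* Usage sketch (illustrative, not part of this header): tile-wide vote with the
   warp-sync builtins. It assumes HIP_ENABLE_WARP_SYNC_BUILTINS was defined when
   the header was included; the kernel and buffers are hypothetical.

__global__ void tile_vote_kernel(const int* flags, unsigned long long* ballots) {
  namespace cg = cooperative_groups;
  auto tile = cg::tiled_partition<32>(cg::this_thread_block());
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned long long votes = tile.ballot(flags[tid] != 0);  // one bit per lane of the tile
  if (tile.thread_rank() == 0) {
    ballots[tid / 32] = votes;
  }
}
*/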

template <unsigned int tileSize, typename ParentCGTy>
class parent_group_info {
 public:
  // Returns the linear rank of the group within the set of tiles partitioned
  // from a parent group (bounded by meta_group_size)
  __CG_STATIC_QUALIFIER__ unsigned int meta_group_rank() {
    return ParentCGTy::thread_rank() / tileSize;
  }

  // Returns the number of groups created when the parent group was partitioned.
  __CG_STATIC_QUALIFIER__ unsigned int meta_group_size() {
    return (ParentCGTy::size() + tileSize - 1) / tileSize;
  }
};

template <unsigned int tileSize, class ParentCGTy>
class thread_block_tile_type : public thread_block_tile_base<tileSize>,
                               public tiled_group,
                               public parent_group_info<tileSize, ParentCGTy> {
  _CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize;
  typedef thread_block_tile_base<numThreads> tbtBase;

 protected:
  __CG_QUALIFIER__ thread_block_tile_type() : tiled_group(numThreads) {
    coalesced_info.tiled_info.size = numThreads;
    coalesced_info.tiled_info.is_tiled = true;
  }

 public:
  using tbtBase::size;
  using tbtBase::sync;
  using tbtBase::thread_rank;
};

// Partial template specialization
template <unsigned int tileSize>
class thread_block_tile_type<tileSize, void> : public thread_block_tile_base<tileSize>,
                                               public tiled_group {
  _CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize;

  typedef thread_block_tile_base<numThreads> tbtBase;

 protected:
  __CG_QUALIFIER__ thread_block_tile_type(unsigned int meta_group_rank, unsigned int meta_group_size)
      : tiled_group(numThreads) {
    coalesced_info.tiled_info.size = numThreads;
    coalesced_info.tiled_info.is_tiled = true;
    coalesced_info.tiled_info.meta_group_rank = meta_group_rank;
    coalesced_info.tiled_info.meta_group_size = meta_group_size;
  }

 public:
  using tbtBase::size;
  using tbtBase::sync;
  using tbtBase::thread_rank;

  __CG_QUALIFIER__ unsigned int meta_group_rank() const {
    return coalesced_info.tiled_info.meta_group_rank;
  }

  __CG_QUALIFIER__ unsigned int meta_group_size() const {
    return coalesced_info.tiled_info.meta_group_size;
  }
  // end of cooperative group
};

__CG_QUALIFIER__ thread_group this_thread() {
  thread_group g(internal::group_type::cg_coalesced_group, 1, __ockl_activelane_u32());
  return g;
}

__CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent, unsigned int tile_size) {
  if (parent.cg_type() == internal::cg_tiled_group) {
    const tiled_group* cg = static_cast<const tiled_group*>(&parent);
    return cg->new_tiled_group(tile_size);
  } else if (parent.cg_type() == internal::cg_coalesced_group) {
    const coalesced_group* cg = static_cast<const coalesced_group*>(&parent);
    return cg->new_tiled_group(tile_size);
  } else {
    const thread_block* tb = static_cast<const thread_block*>(&parent);
    return tb->new_tiled_group(tile_size);
  }
}

// Thread block type overload
__CG_QUALIFIER__ thread_group tiled_partition(const thread_block& parent, unsigned int tile_size) {
  return (parent.new_tiled_group(tile_size));
}

__CG_QUALIFIER__ tiled_group tiled_partition(const tiled_group& parent, unsigned int tile_size) {
  return (parent.new_tiled_group(tile_size));
}

// If a coalesced group is passed to be partitioned, it should remain coalesced
__CG_QUALIFIER__ coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tile_size) {
  return (parent.new_tiled_group(tile_size));
}
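
/* Usage sketch (illustrative, not part of this header): run-time tiling with the
   non-templated tiled_partition() overloads. The kernel and the tile width of 8
   are assumptions; the tile size must be a power of two no larger than the
   wavefront size.

__global__ void dynamic_tile_kernel(unsigned int* tile_leaders) {
  namespace cg = cooperative_groups;
  cg::thread_block block = cg::this_thread_block();
  cg::thread_group tile = cg::tiled_partition(block, 8);  // 8-thread tiles
  if (tile.thread_rank() == 0) {
    // record the block-wide rank of each tile's leader thread
    tile_leaders[blockIdx.x * (blockDim.x / 8) + block.thread_rank() / 8] = block.thread_rank();
  }
}
*/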

namespace impl {
template <unsigned int size, class ParentCGTy> class thread_block_tile_internal;

template <unsigned int size, class ParentCGTy>
class thread_block_tile_internal : public thread_block_tile_type<size, ParentCGTy> {
 protected:
  template <unsigned int tbtSize, class tbtParentT>
  __CG_QUALIFIER__ thread_block_tile_internal(
      const thread_block_tile_internal<tbtSize, tbtParentT>& g)
      : thread_block_tile_type<size, ParentCGTy>(g.meta_group_rank(), g.meta_group_size()) {}

  __CG_QUALIFIER__ thread_block_tile_internal(const thread_block& g)
      : thread_block_tile_type<size, ParentCGTy>() {}
};
}  // namespace impl

template <unsigned int size, class ParentCGTy>
class thread_block_tile : public impl::thread_block_tile_internal<size, ParentCGTy> {
 protected:
  __CG_QUALIFIER__ thread_block_tile(const ParentCGTy& g)
      : impl::thread_block_tile_internal<size, ParentCGTy>(g) {}

 public:
  __CG_QUALIFIER__ operator thread_block_tile<size, void>() const {
    return thread_block_tile<size, void>(*this);
  }
};

template <unsigned int size>
class thread_block_tile<size, void> : public impl::thread_block_tile_internal<size, void> {
  template <unsigned int, class ParentCGTy> friend class thread_block_tile;

 protected:
 public:
  template <class ParentCGTy>
  __CG_QUALIFIER__ thread_block_tile(const thread_block_tile<size, ParentCGTy>& g)
      : impl::thread_block_tile_internal<size, void>(g) {}
};

template <unsigned int size, class ParentCGTy = void> class thread_block_tile;

namespace impl {
template <unsigned int size, class ParentCGTy> struct tiled_partition_internal;

template <unsigned int size>
struct tiled_partition_internal<size, thread_block> : public thread_block_tile<size, thread_block> {
  __CG_QUALIFIER__ tiled_partition_internal(const thread_block& g)
      : thread_block_tile<size, thread_block>(g) {}
};

}  // namespace impl

template <unsigned int size, class ParentCGTy>
__CG_QUALIFIER__ thread_block_tile<size, ParentCGTy> tiled_partition(const ParentCGTy& g) {
  static_assert(is_valid_tile_size<size>::value,
                "Tiled partition with size > wavefront size. Currently not supported ");
  return impl::tiled_partition_internal<size, ParentCGTy>(g);
}
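
/* Usage sketch (illustrative, not part of this header): compile-time tiles and a
   tile-level shuffle reduction. The kernel, the 32-wide tile, and the data
   layout are assumptions for illustration.

__global__ void tile_reduce_kernel(const float* in, float* partial_sums) {
  namespace cg = cooperative_groups;
  cg::thread_block block = cg::this_thread_block();
  cg::thread_block_tile<32> tile = cg::tiled_partition<32>(block);
  float v = in[block.group_index().x * block.size() + block.thread_rank()];
  for (unsigned int offset = tile.size() / 2; offset > 0; offset /= 2) {
    v += tile.shfl_down(v, offset);  // tree reduction within the tile
  }
  if (tile.thread_rank() == 0) {
    partial_sums[block.group_index().x * tile.meta_group_size() + tile.meta_group_rank()] = v;
  }
}
*/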

#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS
__CG_QUALIFIER__ coalesced_group binary_partition(const coalesced_group& cgrp, bool pred) {
  auto mask = __ballot_sync<unsigned long long>(cgrp.coalesced_info.member_mask, pred);

  if (pred) {
    return coalesced_group(mask);
  } else {
    return coalesced_group(cgrp.coalesced_info.member_mask ^ mask);
  }
}

template <unsigned int size, class parent>
__CG_QUALIFIER__ coalesced_group binary_partition(const thread_block_tile<size, parent>& tgrp,
                                                  bool pred) {
  auto mask = __ballot_sync<unsigned long long>(tgrp.build_mask(), pred);

  if (pred) {
    return coalesced_group(mask);
  } else {
    return coalesced_group(tgrp.build_mask() ^ mask);
  }
}
#endif
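
/* Usage sketch (illustrative, not part of this header): splitting a coalesced
   group by a per-thread predicate. Assumes HIP_ENABLE_WARP_SYNC_BUILTINS is
   defined and a single-wavefront block; the kernel and predicate are hypothetical.

__global__ void binary_partition_kernel(const int* keys, unsigned int* side_sizes) {
  namespace cg = cooperative_groups;
  cg::coalesced_group active = cg::coalesced_threads();
  bool is_even = (keys[threadIdx.x] % 2) == 0;
  // Each thread lands in the sub-group of lanes that share its predicate value.
  cg::coalesced_group side = cg::binary_partition(active, is_even);
  if (side.thread_rank() == 0) {
    side_sizes[blockIdx.x * 2 + (is_even ? 1 : 0)] = side.size();  // lanes on this side
  }
}
*/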

}  // namespace cooperative_groups

#endif  // __cplusplus
#endif  // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H
Device-side implementation of the cooperative groups feature.