32 #ifdef HIP_ENABLE_WARP_SYNC_BUILTINS
34 #if !defined(__HIPCC_RTC__)
35 #include "amd_warp_functions.h"
36 #include "hip_assert.h"
// Helper that takes a value of type T and returns a T, built on
// __builtin_amdgcn_readfirstlane.
// NOTE(review): this listing omits several lines of the function (the
// template header, the declaration of `u`, and the return), so only the
// 64-bit recombination shown below is described here.
41 T __hip_readfirstlane(T val) {
// The builtin is invoked twice: once on u.l and once on u.l shifted right by
// 32 bits. Each result is cast to unsigned before being widened to
// unsigned long long (presumably to avoid sign extension -- confirm).
52 unsigned long long lower = (unsigned)__builtin_amdgcn_readfirstlane(u.l);
53 unsigned long long upper =
54 (unsigned)__builtin_amdgcn_readfirstlane(u.l >> 32);
// Recombine the two halves into a single 64-bit value stored back in u.l.
55 u.l = (upper << 32) | lower;
// When warpSize is 32, keeps only the low 32 bits of MASK (MASK &= 0xFFFFFFFF).
// MASK is modified in place, so the argument must be a modifiable lvalue.
// NOTE(review): the macro's remaining continuation lines are not shown in this
// listing.
60 #define __hip_adjust_mask_for_wave32(MASK) \
62 if (warpSize == 32) MASK &= 0xFFFFFFFF; \
// Validates MASK using __hip_assert:
//   1. MASK must be non-zero.
//   2. Inside a loop that runs while __any(!done) holds, chosen_mask is taken
//      from __hip_readfirstlane(MASK); threads whose MASK equals chosen_mask
//      assert that MASK == __ballot(true).
// MASK is expanded several times, so the argument should be free of side
// effects.
// NOTE(review): the declaration of `done` and the statement that updates it
// are on lines not shown in this listing.
90 #define __hip_check_mask(MASK) \
92 __hip_assert(MASK && "mask must be non-zero"); \
94 while (__any(!done)) { \
96 auto chosen_mask = __hip_readfirstlane(MASK); \
97 if (MASK == chosen_mask) { \
98 __hip_assert(MASK == __ballot(true) && \
99 "all threads specified in the mask" \
100 " must execute the same operation with the same mask"); \
// Performs the same mask checks as the loop in __hip_check_mask and, inside
// the branch where a thread's MASK equals chosen_mask, evaluates
// FUNC(__VA_ARGS__) and assigns the result to RETVAL.
//   RETVAL - lvalue that receives the result of FUNC.
//   FUNC   - callable invoked with the variadic arguments.
//   MASK   - mask expression; expanded several times, so it should be free of
//            side effects.
// NOTE(review): the declaration of `done` and the statement that updates it
// are on lines not shown in this listing.
107 #define __hip_do_sync(RETVAL, FUNC, MASK, ...) \
109 __hip_assert(MASK && "mask must be non-zero"); \
111 while (__any(!done)) { \
113 auto chosen_mask = __hip_readfirstlane(MASK); \
114 if (MASK == chosen_mask) { \
115 __hip_assert(MASK == __ballot(true) && \
116 "all threads specified in the mask" \
117 " must execute the same operation with the same mask"); \
118 RETVAL = FUNC(__VA_ARGS__); \
// Returns __ballot(predicate) restricted to the bits set in `mask`.
//   mask      - must be an integral type of size 8, per the condition and
//               message below (the assertion keyword itself is on a line not
//               shown in this listing).
//   predicate - value forwarded to __ballot.
127 template <
typename MaskT>
129 unsigned long long __ballot_sync(MaskT mask,
int predicate) {
131 __hip_internal::is_integral<MaskT>::value &&
sizeof(MaskT) == 8,
132 "The mask must be a 64-bit integer. "
133 "Implicitly promoting a smaller integer is almost always an error.");
// Drop the upper 32 mask bits on wave32, then validate the mask.
134 __hip_adjust_mask_for_wave32(mask);
135 __hip_check_mask(mask);
136 return __ballot(predicate) & mask;
// Returns non-zero when __ballot_sync(mask, predicate) equals `mask`, i.e.
// when every bit set in the (wave32-adjusted) mask is also set in the ballot
// result.
//   mask      - must be an integral type of size 8, per the condition and
//               message below.
//   predicate - value forwarded to __ballot_sync.
139 template <
typename MaskT>
141 int __all_sync(MaskT mask,
int predicate) {
143 __hip_internal::is_integral<MaskT>::value &&
sizeof(MaskT) == 8,
144 "The mask must be a 64-bit integer. "
145 "Implicitly promoting a smaller integer is almost always an error.");
// The mask is adjusted here as well, so the comparison below uses the same
// adjusted value that __ballot_sync masks its result with.
146 __hip_adjust_mask_for_wave32(mask);
147 return __ballot_sync(mask, predicate) == mask;
// Returns non-zero when __ballot_sync(mask, predicate) has at least one bit
// set.
//   mask      - must be an integral type of size 8, per the condition and
//               message below.
//   predicate - value forwarded to __ballot_sync.
150 template <
typename MaskT>
152 int __any_sync(MaskT mask,
int predicate) {
154 __hip_internal::is_integral<MaskT>::value &&
sizeof(MaskT) == 8,
155 "The mask must be a 64-bit integer. "
156 "Implicitly promoting a smaller integer is almost always an error.");
157 __hip_adjust_mask_for_wave32(mask);
158 return __ballot_sync(mask, predicate) != 0;
// Computes a mask for `value`: inside a loop that runs while __any(!done)
// holds, a candidate is taken from __hip_readfirstlane(value), and threads
// whose value compares equal to the candidate set their result to
// __activemask().
//   value - integral or floating-point, 4 or 8 bytes wide, per the condition
//           and message below.
// NOTE(review): the declaration/update of `done` and the return statement are
// on lines not shown in this listing.
// NOTE(review): floating-point T is accepted and the comparison uses ==; a NaN
// never compares equal to itself -- confirm the loop terminates for NaN.
163 template <
typename T>
165 unsigned long long __match_any(T value) {
167 (__hip_internal::is_integral<T>::value || __hip_internal::is_floating_point<T>::value) &&
168 (
sizeof(T) == 4 ||
sizeof(T) == 8),
169 "T can be int, unsigned int, long, unsigned long, long long, unsigned "
170 "long long, float or double.");
172 unsigned long long retval = 0;
174 while (__any(!done)) {
176 T chosen = __hip_readfirstlane(value);
177 if (chosen == value) {
178 retval = __activemask();
// Returns __match_any(value) restricted to the bits set in `mask`.
//   mask  - must be an integral type of size 8, per the condition and message
//           below.
//   value - forwarded to __match_any.
187 template <
typename MaskT,
typename T>
189 unsigned long long __match_any_sync(MaskT mask, T value) {
191 __hip_internal::is_integral<MaskT>::value &&
sizeof(MaskT) == 8,
192 "The mask must be a 64-bit integer. "
193 "Implicitly promoting a smaller integer is almost always an error.");
// Drop the upper 32 mask bits on wave32, then validate the mask.
194 __hip_adjust_mask_for_wave32(mask);
195 __hip_check_mask(mask);
196 return __match_any(value) & mask;
// Compares `value` against the result of __hip_readfirstlane(value); when
// __all(first == value) holds, returns __activemask().
//   value - integral or floating-point, 4 or 8 bytes wide, per the condition
//           and message below.
//   pred  - output pointer.
// NOTE(review): the writes to *pred and the path taken when the values do not
// all match are on lines not shown in this listing.
199 template <
typename T>
201 unsigned long long __match_all(T value,
int* pred) {
203 (__hip_internal::is_integral<T>::value || __hip_internal::is_floating_point<T>::value) &&
204 (
sizeof(T) == 4 ||
sizeof(T) == 8),
205 "T can be int, unsigned int, long, unsigned long, long long, unsigned "
206 "long long, float or double.");
207 T first = __hip_readfirstlane(value);
208 if (__all(first == value)) {
210 return __activemask();
// Mask-checked wrapper around __match_all: after the wave32 adjustment,
// __hip_do_sync validates `mask` and assigns __match_all(value, pred) to
// `retval`.
//   mask  - must be an integral type of size 8, per the condition and message
//           below.
//   value - forwarded to __match_all.
//   pred  - forwarded to __match_all.
// NOTE(review): the declaration of `retval` and the return statement are on
// lines not shown in this listing.
217 template <
typename MaskT,
typename T>
219 unsigned long long __match_all_sync(MaskT mask, T value,
int* pred) {
221 __hip_internal::is_integral<MaskT>::value &&
sizeof(MaskT) == 8,
222 "The mask must be a 64-bit integer. "
223 "Implicitly promoting a smaller integer is almost always an error.");
225 __hip_adjust_mask_for_wave32(mask);
226 __hip_do_sync(retval, __match_all, mask, value, pred);
// Validates `mask` and then returns __shfl(var, srcLane, width).
//   mask    - must be an integral type of size 8, per the condition and
//             message below; it is only adjusted and checked here and is not
//             passed on to __shfl.
//   var     - value forwarded to __shfl.
//   srcLane - forwarded to __shfl.
//   width   - forwarded to __shfl; defaults to __AMDGCN_WAVEFRONT_SIZE.
232 template <
typename MaskT,
typename T>
234 T __shfl_sync(MaskT mask, T var,
int srcLane,
235 int width = __AMDGCN_WAVEFRONT_SIZE) {
237 __hip_internal::is_integral<MaskT>::value &&
sizeof(MaskT) == 8,
238 "The mask must be a 64-bit integer. "
239 "Implicitly promoting a smaller integer is almost always an error.");
240 __hip_adjust_mask_for_wave32(mask);
241 __hip_check_mask(mask);
242 return __shfl(var, srcLane, width);
// Validates `mask` and then returns __shfl_up(var, delta, width).
//   mask  - must be an integral type of size 8, per the condition and message
//           below; it is only adjusted and checked here and is not passed on
//           to __shfl_up.
//   var   - value forwarded to __shfl_up.
//   delta - forwarded to __shfl_up.
//   width - forwarded to __shfl_up; defaults to __AMDGCN_WAVEFRONT_SIZE.
245 template <
typename MaskT,
typename T>
247 T __shfl_up_sync(MaskT mask, T var,
unsigned int delta,
248 int width = __AMDGCN_WAVEFRONT_SIZE) {
250 __hip_internal::is_integral<MaskT>::value &&
sizeof(MaskT) == 8,
251 "The mask must be a 64-bit integer. "
252 "Implicitly promoting a smaller integer is almost always an error.");
253 __hip_adjust_mask_for_wave32(mask);
254 __hip_check_mask(mask);
255 return __shfl_up(var, delta, width);
// Validates `mask` and then returns __shfl_down(var, delta, width).
//   mask  - must be an integral type of size 8, per the condition and message
//           below; it is only adjusted and checked here and is not passed on
//           to __shfl_down.
//   var   - value forwarded to __shfl_down.
//   delta - forwarded to __shfl_down.
//   width - forwarded to __shfl_down; defaults to __AMDGCN_WAVEFRONT_SIZE.
258 template <
typename MaskT,
typename T>
260 T __shfl_down_sync(MaskT mask, T var,
unsigned int delta,
261 int width = __AMDGCN_WAVEFRONT_SIZE) {
263 __hip_internal::is_integral<MaskT>::value &&
sizeof(MaskT) == 8,
264 "The mask must be a 64-bit integer. "
265 "Implicitly promoting a smaller integer is almost always an error.");
266 __hip_adjust_mask_for_wave32(mask);
267 __hip_check_mask(mask);
268 return __shfl_down(var, delta, width);
// Validates `mask` and then returns __shfl_xor(var, laneMask, width).
//   mask     - must be an integral type of size 8, per the condition and
//              message below; it is only adjusted and checked here and is not
//              passed on to __shfl_xor.
//   var      - value forwarded to __shfl_xor.
//   laneMask - forwarded to __shfl_xor.
//   width    - forwarded to __shfl_xor; defaults to __AMDGCN_WAVEFRONT_SIZE.
271 template <
typename MaskT,
typename T>
273 T __shfl_xor_sync(MaskT mask, T var,
int laneMask,
274 int width = __AMDGCN_WAVEFRONT_SIZE) {
276 __hip_internal::is_integral<MaskT>::value &&
sizeof(MaskT) == 8,
277 "The mask must be a 64-bit integer. "
278 "Implicitly promoting a smaller integer is almost always an error.");
279 __hip_adjust_mask_for_wave32(mask);
280 __hip_check_mask(mask);
281 return __shfl_xor(var, laneMask, width);
285 #undef __hip_check_mask
286 #undef __hip_adjust_mask_for_wave32