#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_WARP_FUNCTIONS_H
#define HIP_INCLUDE_HIP_AMD_DETAIL_WARP_FUNCTIONS_H
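
// Thin wrappers over the AMDGCN cross-lane data-share (DS) builtins. Each one
// round-trips its payload through a union so unsigned/float values can be fed
// to the int-typed builtins without altering their bit patterns.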
__device__
static inline unsigned __hip_ds_bpermute(int index, unsigned src) {
  union { int i; unsigned u; float f; } tmp; tmp.u = src;
  tmp.i = __builtin_amdgcn_ds_bpermute(index, tmp.i);
  return tmp.u;
}

__device__
static inline float __hip_ds_bpermutef(int index, float src) {
  union { int i; unsigned u; float f; } tmp; tmp.f = src;
  tmp.i = __builtin_amdgcn_ds_bpermute(index, tmp.i);
  return tmp.f;
}

__device__
static inline unsigned __hip_ds_permute(int index, unsigned src) {
  union { int i; unsigned u; float f; } tmp; tmp.u = src;
  tmp.i = __builtin_amdgcn_ds_permute(index, tmp.i);
  return tmp.u;
}

__device__
static inline float __hip_ds_permutef(int index, float src) {
  union { int i; unsigned u; float f; } tmp; tmp.f = src;
  tmp.i = __builtin_amdgcn_ds_permute(index, tmp.i);
  return tmp.f;
}
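
// The swizzle pattern must be a compile-time constant, so the user-facing
// macros below forward to templates that take it as a non-type template
// argument.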
#define __hip_ds_swizzle(src, pattern)  __hip_ds_swizzle_N<(pattern)>((src))
#define __hip_ds_swizzlef(src, pattern) __hip_ds_swizzlef_N<(pattern)>((src))
template <int pattern>
__device__
static inline unsigned __hip_ds_swizzle_N(unsigned int src) {
  union { int i; unsigned u; float f; } tmp; tmp.u = src;
  tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
  return tmp.u;
}

template <int pattern>
__device__
static inline float __hip_ds_swizzlef_N(float src) {
  union { int i; unsigned u; float f; } tmp; tmp.f = src;
  tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
  return tmp.f;
}
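
// mov_dpp likewise requires all of its control operands at compile time,
// hence the template indirection behind __hip_move_dpp.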
#define __hip_move_dpp(src, dpp_ctrl, row_mask, bank_mask, bound_ctrl) \
  __hip_move_dpp_N<(dpp_ctrl), (row_mask), (bank_mask), (bound_ctrl)>((src))

template <int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl>
__device__
static inline int __hip_move_dpp_N(int src) {
  return __builtin_amdgcn_mov_dpp(src, dpp_ctrl, row_mask, bank_mask,
                                  bound_ctrl);
}
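
// warpSize follows the compilation target's wavefront width: 32 on wave32
// targets, 64 on wave64 targets.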
static constexpr int warpSize = __AMDGCN_WAVEFRONT_SIZE;
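
// Wavefront-wide vote/ballot functions, built on the OCKL device library and
// the amdgcn compare builtins.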
__device__
inline
int __all(int predicate) {
    return __ockl_wfall_i32(predicate);
}

__device__
inline
int __any(int predicate) {
    return __ockl_wfany_i32(predicate);
}

__device__
inline
unsigned long long int __ballot(int predicate) {
    return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
}

__device__
inline
unsigned long long int __ballot64(int predicate) {
    return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
}
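
// __activemask is only exposed when the warp-sync builtins are enabled.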
#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS
__device__
inline
unsigned long long __activemask() {
    return __ballot(true);
}
#endif // HIP_ENABLE_WARP_SYNC_BUILTINS
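
// __lane_id counts the lanes below the current one across both 32-bit halves
// of an all-ones mask, yielding this lane's index within the wavefront.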
__device__
static inline unsigned int __lane_id() {
    return __builtin_amdgcn_mbcnt_hi(
        -1, __builtin_amdgcn_mbcnt_lo(-1, 0));
}
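
// __shfl reads `var` from lane `src_lane` within each `width`-wide group of
// lanes. The int overload does the real work through ds_bpermute (which is
// byte-addressed, hence the `index << 2`); the remaining overloads reinterpret
// their payload as one or two ints and forward to it.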
__device__
inline
int __shfl(int var, int src_lane, int width = warpSize) {
    int self = __lane_id();
    int index = (src_lane & (width - 1)) + (self & ~(width - 1));
    return __builtin_amdgcn_ds_bpermute(index << 2, var);
}
__device__
inline
unsigned int __shfl(unsigned int var, int src_lane, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.u = var;
    tmp.i = __shfl(tmp.i, src_lane, width);
    return tmp.u;
}
__device__
inline
float __shfl(float var, int src_lane, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.f = var;
    tmp.i = __shfl(tmp.i, src_lane, width);
    return tmp.f;
}
__device__
inline
double __shfl(double var, int src_lane, int width = warpSize) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");
    static_assert(sizeof(double) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl(tmp[0], src_lane, width);
    tmp[1] = __shfl(tmp[1], src_lane, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
__device__
inline
long __shfl(long var, int src_lane, int width = warpSize)
{
    #ifndef _MSC_VER
    // LP64: long is 64-bit, shuffle it as two 32-bit halves.
    static_assert(sizeof(long) == 2 * sizeof(int), "");
    static_assert(sizeof(long) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl(tmp[0], src_lane, width);
    tmp[1] = __shfl(tmp[1], src_lane, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
    #else
    // LLP64 (MSVC): long is 32-bit, forward to the int overload.
    static_assert(sizeof(long) == sizeof(int), "");
    return static_cast<long>(__shfl(static_cast<int>(var), src_lane, width));
    #endif
}

__device__
inline
unsigned long __shfl(unsigned long var, int src_lane, int width = warpSize) {
    #ifndef _MSC_VER
    static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
    static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");

    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl(tmp[0], src_lane, width);
    tmp[1] = __shfl(tmp[1], src_lane, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
    #else
    static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
    return static_cast<unsigned long>(__shfl(static_cast<unsigned int>(var), src_lane, width));
    #endif
}
__device__
inline
long long __shfl(long long var, int src_lane, int width = warpSize)
{
    static_assert(sizeof(long long) == 2 * sizeof(int), "");
    static_assert(sizeof(long long) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl(tmp[0], src_lane, width);
    tmp[1] = __shfl(tmp[1], src_lane, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}

__device__
inline
unsigned long long __shfl(unsigned long long var, int src_lane, int width = warpSize) {
    static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
    static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");

    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl(tmp[0], src_lane, width);
    tmp[1] = __shfl(tmp[1], src_lane, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
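
// __shfl_up reads from the lane `lane_delta` positions below the caller; lanes
// whose source would fall before the start of their `width`-wide group keep
// their own value.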
__device__
inline
int __shfl_up(int var, unsigned int lane_delta, int width = warpSize) {
    int self = __lane_id();
    int index = self - lane_delta;
    index = (index < (self & ~(width - 1))) ? self : index;
    return __builtin_amdgcn_ds_bpermute(index << 2, var);
}
__device__
inline
unsigned int __shfl_up(unsigned int var, unsigned int lane_delta, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.u = var;
    tmp.i = __shfl_up(tmp.i, lane_delta, width);
    return tmp.u;
}
__device__
inline
float __shfl_up(float var, unsigned int lane_delta, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.f = var;
    tmp.i = __shfl_up(tmp.i, lane_delta, width);
    return tmp.f;
}
__device__
inline
double __shfl_up(double var, unsigned int lane_delta, int width = warpSize) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");
    static_assert(sizeof(double) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_up(tmp[0], lane_delta, width);
    tmp[1] = __shfl_up(tmp[1], lane_delta, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
__device__
inline
long __shfl_up(long var, unsigned int lane_delta, int width = warpSize)
{
    #ifndef _MSC_VER
    static_assert(sizeof(long) == 2 * sizeof(int), "");
    static_assert(sizeof(long) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_up(tmp[0], lane_delta, width);
    tmp[1] = __shfl_up(tmp[1], lane_delta, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
    #else
    static_assert(sizeof(long) == sizeof(int), "");
    return static_cast<long>(__shfl_up(static_cast<int>(var), lane_delta, width));
    #endif
}

__device__
inline
unsigned long __shfl_up(unsigned long var, unsigned int lane_delta, int width = warpSize)
{
    #ifndef _MSC_VER
    static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
    static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");

    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_up(tmp[0], lane_delta, width);
    tmp[1] = __shfl_up(tmp[1], lane_delta, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
    #else
    static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
    return static_cast<unsigned long>(__shfl_up(static_cast<unsigned int>(var), lane_delta, width));
    #endif
}
__device__
inline
long long __shfl_up(long long var, unsigned int lane_delta, int width = warpSize)
{
    static_assert(sizeof(long long) == 2 * sizeof(int), "");
    static_assert(sizeof(long long) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_up(tmp[0], lane_delta, width);
    tmp[1] = __shfl_up(tmp[1], lane_delta, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}

__device__
inline
unsigned long long __shfl_up(unsigned long long var, unsigned int lane_delta, int width = warpSize)
{
    static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
    static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");

    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_up(tmp[0], lane_delta, width);
    tmp[1] = __shfl_up(tmp[1], lane_delta, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
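
// __shfl_down reads from the lane `lane_delta` positions above the caller;
// lanes whose source would fall past the end of their `width`-wide group keep
// their own value.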
__device__
inline
int __shfl_down(int var, unsigned int lane_delta, int width = warpSize) {
    int self = __lane_id();
    int index = self + lane_delta;
    index = (int)((self & (width - 1)) + lane_delta) >= width ? self : index;
    return __builtin_amdgcn_ds_bpermute(index << 2, var);
}
__device__
inline
unsigned int __shfl_down(unsigned int var, unsigned int lane_delta, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.u = var;
    tmp.i = __shfl_down(tmp.i, lane_delta, width);
    return tmp.u;
}
__device__
inline
float __shfl_down(float var, unsigned int lane_delta, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.f = var;
    tmp.i = __shfl_down(tmp.i, lane_delta, width);
    return tmp.f;
}
__device__
inline
double __shfl_down(double var, unsigned int lane_delta, int width = warpSize) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");
    static_assert(sizeof(double) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_down(tmp[0], lane_delta, width);
    tmp[1] = __shfl_down(tmp[1], lane_delta, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
__device__
inline
long __shfl_down(long var, unsigned int lane_delta, int width = warpSize)
{
    #ifndef _MSC_VER
    static_assert(sizeof(long) == 2 * sizeof(int), "");
    static_assert(sizeof(long) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_down(tmp[0], lane_delta, width);
    tmp[1] = __shfl_down(tmp[1], lane_delta, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
    #else
    static_assert(sizeof(long) == sizeof(int), "");
    return static_cast<long>(__shfl_down(static_cast<int>(var), lane_delta, width));
    #endif
}

__device__
inline
unsigned long __shfl_down(unsigned long var, unsigned int lane_delta, int width = warpSize)
{
    #ifndef _MSC_VER
    static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
    static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");

    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_down(tmp[0], lane_delta, width);
    tmp[1] = __shfl_down(tmp[1], lane_delta, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
    #else
    static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
    return static_cast<unsigned long>(__shfl_down(static_cast<unsigned int>(var), lane_delta, width));
    #endif
}
__device__
inline
long long __shfl_down(long long var, unsigned int lane_delta, int width = warpSize)
{
    static_assert(sizeof(long long) == 2 * sizeof(int), "");
    static_assert(sizeof(long long) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_down(tmp[0], lane_delta, width);
    tmp[1] = __shfl_down(tmp[1], lane_delta, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}

__device__
inline
unsigned long long __shfl_down(unsigned long long var, unsigned int lane_delta, int width = warpSize)
{
    static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
    static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");

    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_down(tmp[0], lane_delta, width);
    tmp[1] = __shfl_down(tmp[1], lane_delta, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
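
// __shfl_xor exchanges values between lanes whose indices differ by
// `lane_mask` (a butterfly pattern), the usual building block for warp-wide
// reductions.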
__device__
inline
int __shfl_xor(int var, int lane_mask, int width = warpSize) {
    int self = __lane_id();
    int index = self ^ lane_mask;
    index = index >= ((self + width) & ~(width - 1)) ? self : index;
    return __builtin_amdgcn_ds_bpermute(index << 2, var);
}
__device__
inline
unsigned int __shfl_xor(unsigned int var, int lane_mask, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.u = var;
    tmp.i = __shfl_xor(tmp.i, lane_mask, width);
    return tmp.u;
}
__device__
inline
float __shfl_xor(float var, int lane_mask, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.f = var;
    tmp.i = __shfl_xor(tmp.i, lane_mask, width);
    return tmp.f;
}
__device__
inline
double __shfl_xor(double var, int lane_mask, int width = warpSize) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");
    static_assert(sizeof(double) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
    tmp[1] = __shfl_xor(tmp[1], lane_mask, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
__device__
inline
long __shfl_xor(long var, int lane_mask, int width = warpSize)
{
    #ifndef _MSC_VER
    static_assert(sizeof(long) == 2 * sizeof(int), "");
    static_assert(sizeof(long) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
    tmp[1] = __shfl_xor(tmp[1], lane_mask, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
    #else
    static_assert(sizeof(long) == sizeof(int), "");
    return static_cast<long>(__shfl_xor(static_cast<int>(var), lane_mask, width));
    #endif
}

__device__
inline
unsigned long __shfl_xor(unsigned long var, int lane_mask, int width = warpSize)
{
    #ifndef _MSC_VER
    static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
    static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");

    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
    tmp[1] = __shfl_xor(tmp[1], lane_mask, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
    #else
    static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
    return static_cast<unsigned long>(__shfl_xor(static_cast<unsigned int>(var), lane_mask, width));
    #endif
}
__device__
inline
long long __shfl_xor(long long var, int lane_mask, int width = warpSize)
{
    static_assert(sizeof(long long) == 2 * sizeof(int), "");
    static_assert(sizeof(long long) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
    tmp[1] = __shfl_xor(tmp[1], lane_mask, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}

__device__
inline
unsigned long long __shfl_xor(unsigned long long var, int lane_mask, int width = warpSize)
{
    static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
    static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");

    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
    tmp[1] = __shfl_xor(tmp[1], lane_mask, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
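
// Usage sketch (illustrative only, not part of this header): a wavefront-wide
// sum reduction built on __shfl_down. The kernel name and the assumption that
// each block is a single wavefront are ours, not this file's.
//
//   __global__ void warp_reduce_example(const int* in, int* out) {
//       int v = in[blockIdx.x * warpSize + __lane_id()];
//       // Fold the upper half onto the lower half each step; after
//       // log2(warpSize) steps lane 0 holds the sum of the wavefront.
//       for (int offset = warpSize / 2; offset > 0; offset /= 2)
//           v += __shfl_down(v, offset);
//       if (__lane_id() == 0)
//           out[blockIdx.x] = v;
//   }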