23#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_WARP_FUNCTIONS_H
24#define HIP_INCLUDE_HIP_AMD_DETAIL_WARP_FUNCTIONS_H
26__device__
static inline unsigned __hip_ds_bpermute(
int index,
unsigned src) {
27 union {
int i;
unsigned u;
float f; } tmp; tmp.u = src;
28 tmp.i = __builtin_amdgcn_ds_bpermute(index, tmp.i);
32__device__
static inline float __hip_ds_bpermutef(
int index,
float src) {
33 union {
int i;
unsigned u;
float f; } tmp; tmp.f = src;
34 tmp.i = __builtin_amdgcn_ds_bpermute(index, tmp.i);
38__device__
static inline unsigned __hip_ds_permute(
int index,
unsigned src) {
39 union {
int i;
unsigned u;
float f; } tmp; tmp.u = src;
40 tmp.i = __builtin_amdgcn_ds_permute(index, tmp.i);
44__device__
static inline float __hip_ds_permutef(
int index,
float src) {
45 union {
int i;
unsigned u;
float f; } tmp; tmp.f = src;
46 tmp.i = __builtin_amdgcn_ds_permute(index, tmp.i);
50#define __hip_ds_swizzle(src, pattern) __hip_ds_swizzle_N<(pattern)>((src))
51#define __hip_ds_swizzlef(src, pattern) __hip_ds_swizzlef_N<(pattern)>((src))
54__device__
static inline unsigned __hip_ds_swizzle_N(
unsigned int src) {
55 union {
int i;
unsigned u;
float f; } tmp; tmp.u = src;
56 tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
61__device__
static inline float __hip_ds_swizzlef_N(
float src) {
62 union {
int i;
unsigned u;
float f; } tmp; tmp.f = src;
63 tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
67#define __hip_move_dpp(src, dpp_ctrl, row_mask, bank_mask, bound_ctrl) \
68 __hip_move_dpp_N<(dpp_ctrl), (row_mask), (bank_mask), (bound_ctrl)>((src))
70template <
int dpp_ctrl,
int row_mask,
int bank_mask,
bool bound_ctrl>
71__device__
static inline int __hip_move_dpp_N(
int src) {
72 return __builtin_amdgcn_mov_dpp(src, dpp_ctrl, row_mask, bank_mask,
77static constexpr int warpSize = __AMDGCN_WAVEFRONT_SIZE;
82int __all(
int predicate) {
83 return __ockl_wfall_i32(predicate);
88int __any(
int predicate) {
89 return __ockl_wfany_i32(predicate);
97unsigned long long int __ballot(
int predicate) {
98 return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
103unsigned long long int __ballot64(
int predicate) {
104 return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
108#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS
113unsigned long long __activemask() {
114 return __ballot(
true);
118__device__
static inline unsigned int __lane_id() {
119 return __builtin_amdgcn_mbcnt_hi(
120 -1, __builtin_amdgcn_mbcnt_lo(-1, 0));
125int __shfl(
int var,
int src_lane,
int width = warpSize) {
126 int self = __lane_id();
127 int index = (src_lane & (width - 1)) + (self & ~(width-1));
128 return __builtin_amdgcn_ds_bpermute(index<<2, var);
132unsigned int __shfl(
unsigned int var,
int src_lane,
int width = warpSize) {
133 union {
int i;
unsigned u;
float f; } tmp; tmp.u = var;
134 tmp.i = __shfl(tmp.i, src_lane, width);
139float __shfl(
float var,
int src_lane,
int width = warpSize) {
140 union {
int i;
unsigned u;
float f; } tmp; tmp.f = var;
141 tmp.i = __shfl(tmp.i, src_lane, width);
146double __shfl(
double var,
int src_lane,
int width = warpSize) {
147 static_assert(
sizeof(double) == 2 *
sizeof(int),
"");
148 static_assert(
sizeof(double) ==
sizeof(uint64_t),
"");
150 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
151 tmp[0] = __shfl(tmp[0], src_lane, width);
152 tmp[1] = __shfl(tmp[1], src_lane, width);
154 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
155 double tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
160long __shfl(
long var,
int src_lane,
int width = warpSize)
163 static_assert(
sizeof(long) == 2 *
sizeof(int),
"");
164 static_assert(
sizeof(long) ==
sizeof(uint64_t),
"");
166 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
167 tmp[0] = __shfl(tmp[0], src_lane, width);
168 tmp[1] = __shfl(tmp[1], src_lane, width);
170 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
171 long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
174 static_assert(
sizeof(long) ==
sizeof(int),
"");
175 return static_cast<long>(__shfl(
static_cast<int>(var), src_lane, width));
180unsigned long __shfl(
unsigned long var,
int src_lane,
int width = warpSize) {
182 static_assert(
sizeof(
unsigned long) == 2 *
sizeof(
unsigned int),
"");
183 static_assert(
sizeof(
unsigned long) ==
sizeof(uint64_t),
"");
185 unsigned int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
186 tmp[0] = __shfl(tmp[0], src_lane, width);
187 tmp[1] = __shfl(tmp[1], src_lane, width);
189 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
190 unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
193 static_assert(
sizeof(
unsigned long) ==
sizeof(
unsigned int),
"");
194 return static_cast<unsigned long>(__shfl(
static_cast<unsigned int>(var), src_lane, width));
199long long __shfl(
long long var,
int src_lane,
int width = warpSize)
201 static_assert(
sizeof(
long long) == 2 *
sizeof(int),
"");
202 static_assert(
sizeof(
long long) ==
sizeof(uint64_t),
"");
204 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
205 tmp[0] = __shfl(tmp[0], src_lane, width);
206 tmp[1] = __shfl(tmp[1], src_lane, width);
208 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
209 long long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
214unsigned long long __shfl(
unsigned long long var,
int src_lane,
int width = warpSize) {
215 static_assert(
sizeof(
unsigned long long) == 2 *
sizeof(
unsigned int),
"");
216 static_assert(
sizeof(
unsigned long long) ==
sizeof(uint64_t),
"");
218 unsigned int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
219 tmp[0] = __shfl(tmp[0], src_lane, width);
220 tmp[1] = __shfl(tmp[1], src_lane, width);
222 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
223 unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
229int __shfl_up(
int var,
unsigned int lane_delta,
int width = warpSize) {
230 int self = __lane_id();
231 int index = self - lane_delta;
232 index = (index < (self & ~(width-1)))?self:index;
233 return __builtin_amdgcn_ds_bpermute(index<<2, var);
237unsigned int __shfl_up(
unsigned int var,
unsigned int lane_delta,
int width = warpSize) {
238 union {
int i;
unsigned u;
float f; } tmp; tmp.u = var;
239 tmp.i = __shfl_up(tmp.i, lane_delta, width);
244float __shfl_up(
float var,
unsigned int lane_delta,
int width = warpSize) {
245 union {
int i;
unsigned u;
float f; } tmp; tmp.f = var;
246 tmp.i = __shfl_up(tmp.i, lane_delta, width);
251double __shfl_up(
double var,
unsigned int lane_delta,
int width = warpSize) {
252 static_assert(
sizeof(double) == 2 *
sizeof(int),
"");
253 static_assert(
sizeof(double) ==
sizeof(uint64_t),
"");
255 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
256 tmp[0] = __shfl_up(tmp[0], lane_delta, width);
257 tmp[1] = __shfl_up(tmp[1], lane_delta, width);
259 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
260 double tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
265long __shfl_up(
long var,
unsigned int lane_delta,
int width = warpSize)
268 static_assert(
sizeof(long) == 2 *
sizeof(int),
"");
269 static_assert(
sizeof(long) ==
sizeof(uint64_t),
"");
271 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
272 tmp[0] = __shfl_up(tmp[0], lane_delta, width);
273 tmp[1] = __shfl_up(tmp[1], lane_delta, width);
275 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
276 long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
279 static_assert(
sizeof(long) ==
sizeof(int),
"");
280 return static_cast<long>(__shfl_up(
static_cast<int>(var), lane_delta, width));
286unsigned long __shfl_up(
unsigned long var,
unsigned int lane_delta,
int width = warpSize)
289 static_assert(
sizeof(
unsigned long) == 2 *
sizeof(
unsigned int),
"");
290 static_assert(
sizeof(
unsigned long) ==
sizeof(uint64_t),
"");
292 unsigned int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
293 tmp[0] = __shfl_up(tmp[0], lane_delta, width);
294 tmp[1] = __shfl_up(tmp[1], lane_delta, width);
296 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
297 unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
300 static_assert(
sizeof(
unsigned long) ==
sizeof(
unsigned int),
"");
301 return static_cast<unsigned long>(__shfl_up(
static_cast<unsigned int>(var), lane_delta, width));
307long long __shfl_up(
long long var,
unsigned int lane_delta,
int width = warpSize)
309 static_assert(
sizeof(
long long) == 2 *
sizeof(int),
"");
310 static_assert(
sizeof(
long long) ==
sizeof(uint64_t),
"");
311 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
312 tmp[0] = __shfl_up(tmp[0], lane_delta, width);
313 tmp[1] = __shfl_up(tmp[1], lane_delta, width);
314 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
315 long long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
321unsigned long long __shfl_up(
unsigned long long var,
unsigned int lane_delta,
int width = warpSize)
323 static_assert(
sizeof(
unsigned long long) == 2 *
sizeof(
unsigned int),
"");
324 static_assert(
sizeof(
unsigned long long) ==
sizeof(uint64_t),
"");
325 unsigned int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
326 tmp[0] = __shfl_up(tmp[0], lane_delta, width);
327 tmp[1] = __shfl_up(tmp[1], lane_delta, width);
328 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
329 unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
335int __shfl_down(
int var,
unsigned int lane_delta,
int width = warpSize) {
336 int self = __lane_id();
337 int index = self + lane_delta;
338 index = (int)((self&(width-1))+lane_delta) >= width?self:index;
339 return __builtin_amdgcn_ds_bpermute(index<<2, var);
343unsigned int __shfl_down(
unsigned int var,
unsigned int lane_delta,
int width = warpSize) {
344 union {
int i;
unsigned u;
float f; } tmp; tmp.u = var;
345 tmp.i = __shfl_down(tmp.i, lane_delta, width);
350float __shfl_down(
float var,
unsigned int lane_delta,
int width = warpSize) {
351 union {
int i;
unsigned u;
float f; } tmp; tmp.f = var;
352 tmp.i = __shfl_down(tmp.i, lane_delta, width);
357double __shfl_down(
double var,
unsigned int lane_delta,
int width = warpSize) {
358 static_assert(
sizeof(double) == 2 *
sizeof(int),
"");
359 static_assert(
sizeof(double) ==
sizeof(uint64_t),
"");
361 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
362 tmp[0] = __shfl_down(tmp[0], lane_delta, width);
363 tmp[1] = __shfl_down(tmp[1], lane_delta, width);
365 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
366 double tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
371long __shfl_down(
long var,
unsigned int lane_delta,
int width = warpSize)
374 static_assert(
sizeof(long) == 2 *
sizeof(int),
"");
375 static_assert(
sizeof(long) ==
sizeof(uint64_t),
"");
377 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
378 tmp[0] = __shfl_down(tmp[0], lane_delta, width);
379 tmp[1] = __shfl_down(tmp[1], lane_delta, width);
381 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
382 long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
385 static_assert(
sizeof(long) ==
sizeof(int),
"");
386 return static_cast<long>(__shfl_down(
static_cast<int>(var), lane_delta, width));
391unsigned long __shfl_down(
unsigned long var,
unsigned int lane_delta,
int width = warpSize)
394 static_assert(
sizeof(
unsigned long) == 2 *
sizeof(
unsigned int),
"");
395 static_assert(
sizeof(
unsigned long) ==
sizeof(uint64_t),
"");
397 unsigned int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
398 tmp[0] = __shfl_down(tmp[0], lane_delta, width);
399 tmp[1] = __shfl_down(tmp[1], lane_delta, width);
401 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
402 unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
405 static_assert(
sizeof(
unsigned long) ==
sizeof(
unsigned int),
"");
406 return static_cast<unsigned long>(__shfl_down(
static_cast<unsigned int>(var), lane_delta, width));
411long long __shfl_down(
long long var,
unsigned int lane_delta,
int width = warpSize)
413 static_assert(
sizeof(
long long) == 2 *
sizeof(int),
"");
414 static_assert(
sizeof(
long long) ==
sizeof(uint64_t),
"");
415 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
416 tmp[0] = __shfl_down(tmp[0], lane_delta, width);
417 tmp[1] = __shfl_down(tmp[1], lane_delta, width);
418 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
419 long long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
424unsigned long long __shfl_down(
unsigned long long var,
unsigned int lane_delta,
int width = warpSize)
426 static_assert(
sizeof(
unsigned long long) == 2 *
sizeof(
unsigned int),
"");
427 static_assert(
sizeof(
unsigned long long) ==
sizeof(uint64_t),
"");
428 unsigned int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
429 tmp[0] = __shfl_down(tmp[0], lane_delta, width);
430 tmp[1] = __shfl_down(tmp[1], lane_delta, width);
431 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
432 unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
438int __shfl_xor(
int var,
int lane_mask,
int width = warpSize) {
439 int self = __lane_id();
440 int index = self^lane_mask;
441 index = index >= ((self+width)&~(width-1))?self:index;
442 return __builtin_amdgcn_ds_bpermute(index<<2, var);
446unsigned int __shfl_xor(
unsigned int var,
int lane_mask,
int width = warpSize) {
447 union {
int i;
unsigned u;
float f; } tmp; tmp.u = var;
448 tmp.i = __shfl_xor(tmp.i, lane_mask, width);
453float __shfl_xor(
float var,
int lane_mask,
int width = warpSize) {
454 union {
int i;
unsigned u;
float f; } tmp; tmp.f = var;
455 tmp.i = __shfl_xor(tmp.i, lane_mask, width);
460double __shfl_xor(
double var,
int lane_mask,
int width = warpSize) {
461 static_assert(
sizeof(double) == 2 *
sizeof(int),
"");
462 static_assert(
sizeof(double) ==
sizeof(uint64_t),
"");
464 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
465 tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
466 tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
468 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
469 double tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
474long __shfl_xor(
long var,
int lane_mask,
int width = warpSize)
477 static_assert(
sizeof(long) == 2 *
sizeof(int),
"");
478 static_assert(
sizeof(long) ==
sizeof(uint64_t),
"");
480 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
481 tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
482 tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
484 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
485 long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
488 static_assert(
sizeof(long) ==
sizeof(int),
"");
489 return static_cast<long>(__shfl_xor(
static_cast<int>(var), lane_mask, width));
494unsigned long __shfl_xor(
unsigned long var,
int lane_mask,
int width = warpSize)
497 static_assert(
sizeof(
unsigned long) == 2 *
sizeof(
unsigned int),
"");
498 static_assert(
sizeof(
unsigned long) ==
sizeof(uint64_t),
"");
500 unsigned int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
501 tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
502 tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
504 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
505 unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
508 static_assert(
sizeof(
unsigned long) ==
sizeof(
unsigned int),
"");
509 return static_cast<unsigned long>(__shfl_xor(
static_cast<unsigned int>(var), lane_mask, width));
514long long __shfl_xor(
long long var,
int lane_mask,
int width = warpSize)
516 static_assert(
sizeof(
long long) == 2 *
sizeof(int),
"");
517 static_assert(
sizeof(
long long) ==
sizeof(uint64_t),
"");
518 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
519 tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
520 tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
521 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
522 long long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
527unsigned long long __shfl_xor(
unsigned long long var,
int lane_mask,
int width = warpSize)
529 static_assert(
sizeof(
unsigned long long) == 2 *
sizeof(
unsigned int),
"");
530 static_assert(
sizeof(
unsigned long long) ==
sizeof(uint64_t),
"");
531 unsigned int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
532 tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
533 tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
534 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
535 unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));