#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_FUNCTIONS_H
#define HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_FUNCTIONS_H

#if !defined(__HIPCC_RTC__)
#include <hip/amd_detail/amd_hip_common.h>
#include <hip/amd_detail/hip_assert.h>
#include <hip/hip_runtime_api.h>
#include <hip/hip_vector_types.h>
#endif

#if defined(__clang__) && defined(__HIP__)
extern "C" __device__ int printf(const char *fmt, ...);
#else
template <typename... All>
static inline __device__ void printf(const char* format, All... all) {}
#endif

extern "C" __device__ unsigned long long __ockl_steadyctr_u64();
__device__ static inline unsigned int __popc(unsigned int input) {
    return __builtin_popcount(input);
}
__device__ static inline unsigned int __popcll(unsigned long long int input) {
    return __builtin_popcountll(input);
}

__device__ static inline int __clz(int input) {
    return __ockl_clz_u32((uint)input);
}

__device__ static inline int __clzll(long long int input) {
    return __ockl_clz_u64((uint64_t)input);
}

__device__ static inline unsigned int __ffs(unsigned int input) {
    return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
}

__device__ static inline unsigned int __ffsll(unsigned long long int input) {
    return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
}

__device__ static inline unsigned int __ffsll(unsigned long int input) {
    return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
}

__device__ static inline unsigned int __ffs(int input) {
    return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
}

__device__ static inline unsigned int __ffsll(long long int input) {
    return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
}

__device__ static inline unsigned int __ffsll(long int input) {
    return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
}
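// Illustrative usage sketch (not part of this header): the __ffs* helpers return a
// 1-based bit position and 0 when no bit is set, e.g.
//   __ffs(0u)           == 0
//   __ffs(0x8u)         == 4    // lowest set bit is bit 3, reported as 3 + 1
//   __ffsll(1ull << 40) == 41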
__device__
static int32_t __fns64(uint64_t mask, uint32_t base, int32_t offset) {
  uint64_t temp_mask = mask;
  int32_t temp_offset = offset;
  if (offset == 0) {
    temp_mask &= (1ULL << base);
    temp_offset = 1;
  } else if (offset < 0) {
    temp_mask = __builtin_bitreverse64(mask);
    base = 63 - base;
    temp_offset = -offset;
  }
  temp_mask = temp_mask & ((~0ULL) << base);
  if (__builtin_popcountll(temp_mask) < temp_offset)
    return -1;
  // Binary search for the position of the temp_offset-th set bit.
  int32_t total = 0;
  for (int i = 0x20; i > 0; i >>= 1) {
    uint64_t temp_mask_lo = temp_mask & ((1ULL << i) - 1);
    int32_t pcnt = __builtin_popcountll(temp_mask_lo);
    if (pcnt < temp_offset) {
      temp_mask = temp_mask >> i;
      temp_offset -= pcnt;
      total += i;
    } else {
      temp_mask = temp_mask_lo;
    }
  }
  // A negative offset searched the bit-reversed mask; map the position back.
  return offset < 0 ? 63 - total : total;
}
__device__
static int32_t __fns32(uint64_t mask, uint32_t base, int32_t offset) {
  uint64_t temp_mask = mask;
  int32_t temp_offset = offset;
  if (offset == 0) {
    temp_mask &= (1ULL << base);
    temp_offset = 1;
  } else if (offset < 0) {
    temp_mask = __builtin_bitreverse64(mask);
    base = 63 - base;
    temp_offset = -offset;
  }
  temp_mask = temp_mask & ((~0ULL) << base);
  if (__builtin_popcountll(temp_mask) < temp_offset)
    return -1;
  // Binary search for the position of the temp_offset-th set bit.
  int32_t total = 0;
  for (int i = 0x20; i > 0; i >>= 1) {
    uint64_t temp_mask_lo = temp_mask & ((1ULL << i) - 1);
    int32_t pcnt = __builtin_popcountll(temp_mask_lo);
    if (pcnt < temp_offset) {
      temp_mask = temp_mask >> i;
      temp_offset -= pcnt;
      total += i;
    } else {
      temp_mask = temp_mask_lo;
    }
  }
  // A negative offset searched the bit-reversed mask; map the position back.
  return offset < 0 ? 63 - total : total;
}
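// Illustrative usage sketch (not part of this header), assuming a mask with bits
// 1, 2 and 4 set: a positive offset searches upward from 'base' for the offset-th
// set bit, offset == 0 tests bit 'base' itself, and a negative offset searches
// downward; -1 means no such bit exists.
//   __fns64(0b10110ull, 0, 1) == 1
//   __fns64(0b10110ull, 2, 2) == 4
//   __fns64(0ull, 0, 1)       == -1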
__device__ static inline unsigned int __brev(unsigned int input) {
    return __builtin_bitreverse32(input);
}

__device__ static inline unsigned long long int __brevll(unsigned long long int input) {
    return __builtin_bitreverse64(input);
}

__device__ static inline unsigned int __lastbit_u32_u64(uint64_t input) {
    return input == 0 ? -1 : __builtin_ctzl(input);
}
__device__ static inline unsigned int __bitextract_u32(unsigned int src0, unsigned int src1, unsigned int src2) {
    uint32_t offset = src1 & 31;
    uint32_t width = src2 & 31;
    return width == 0 ? 0 : (src0 << (32 - offset - width)) >> (32 - width);
}

__device__ static inline uint64_t __bitextract_u64(uint64_t src0, unsigned int src1, unsigned int src2) {
    uint64_t offset = src1 & 63;
    uint64_t width = src2 & 63;
    return width == 0 ? 0 : (src0 << (64 - offset - width)) >> (64 - width);
}

__device__ static inline unsigned int __bitinsert_u32(unsigned int src0, unsigned int src1, unsigned int src2, unsigned int src3) {
    uint32_t offset = src2 & 31;
    uint32_t width = src3 & 31;
    uint32_t mask = (1 << width) - 1;
    return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
}

__device__ static inline uint64_t __bitinsert_u64(uint64_t src0, uint64_t src1, unsigned int src2, unsigned int src3) {
    uint64_t offset = src2 & 63;
    uint64_t width = src3 & 63;
    uint64_t mask = (1ULL << width) - 1;
    return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
}
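// Illustrative usage sketch (not part of this header): extract reads 'width' bits
// starting at bit 'offset'; insert overwrites that field with the low bits of src1.
//   __bitextract_u32(0xABCD1234u, 8, 8)    == 0x12u        // bits [15:8]
//   __bitinsert_u32(0xFFFFFFFFu, 0u, 4, 8) == 0xFFFFF00Fu  // clear bits [11:4]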
__device__ inline unsigned int __funnelshift_l(unsigned int lo, unsigned int hi, unsigned int shift) {
    uint32_t mask_shift = shift & 31;
    return mask_shift == 0 ? hi : __builtin_amdgcn_alignbit(hi, lo, 32 - mask_shift);
}

__device__ inline unsigned int __funnelshift_lc(unsigned int lo, unsigned int hi, unsigned int shift) {
    uint32_t min_shift = shift >= 32 ? 32 : shift;
    return min_shift == 0 ? hi : __builtin_amdgcn_alignbit(hi, lo, 32 - min_shift);
}

__device__ inline unsigned int __funnelshift_r(unsigned int lo, unsigned int hi, unsigned int shift) {
    return __builtin_amdgcn_alignbit(hi, lo, shift);
}

__device__ inline unsigned int __funnelshift_rc(unsigned int lo, unsigned int hi, unsigned int shift) {
    return shift >= 32 ? hi : __builtin_amdgcn_alignbit(hi, lo, shift);
}
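// Illustrative usage sketch (not part of this header): the funnel shifts act on the
// 64-bit concatenation hi:lo and return one 32-bit half of the shifted value.
//   __funnelshift_l(0x80000000u, 0x00000001u, 1) == 0x00000003u  // new high word
//   __funnelshift_r(0x00000002u, 0x00000001u, 1) == 0x80000001u  // new low word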
__device__ static unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s);
__device__ static unsigned int __hadd(int x, int y);
__device__ static int __mul24(int x, int y);
__device__ static long long int __mul64hi(long long int x, long long int y);
__device__ static int __mulhi(int x, int y);
__device__ static int __rhadd(int x, int y);
__device__ static unsigned int __sad(int x, int y, unsigned int z);
__device__ static unsigned int __uhadd(unsigned int x, unsigned int y);
__device__ static int __umul24(unsigned int x, unsigned int y);
__device__ static unsigned long long int __umul64hi(unsigned long long int x, unsigned long long int y);
__device__ static unsigned int __umulhi(unsigned int x, unsigned int y);
__device__ static unsigned int __urhadd(unsigned int x, unsigned int y);
__device__ static unsigned int __usad(unsigned int x, unsigned int y, unsigned int z);
struct ucharHolder {
    union { unsigned char c[4]; unsigned int ui; };
} __attribute__((aligned(4)));

struct uchar2Holder {
    union { unsigned int ui[2]; unsigned char c[8]; };
} __attribute__((aligned(8)));

__device__
static inline unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s) {
    struct uchar2Holder cHoldVal;
    struct ucharHolder cHoldKey;
    unsigned int result;
    cHoldKey.ui = s;
    cHoldVal.ui[0] = x;
    cHoldVal.ui[1] = y;
    result = cHoldVal.c[cHoldKey.c[0] & 0x07];
    result += (cHoldVal.c[(cHoldKey.c[0] & 0x70) >> 4] << 8);
    result += (cHoldVal.c[cHoldKey.c[1] & 0x07] << 16);
    result += (cHoldVal.c[(cHoldKey.c[1] & 0x70) >> 4] << 24);
    return result;
}
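// Illustrative usage sketch (not part of this header): the bytes of {x, y} are
// indexed 0-7 (x supplies bytes 0-3, y bytes 4-7), and each selector nibble of 's'
// picks the source for one result byte, least significant nibble first.
//   __byte_perm(0x03020100u, 0x07060504u, 0x7531) == 0x07050301u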
__device__ static inline unsigned int __hadd(int x, int y) {
    // Halving add: floor((x + y) / 2) without overflowing the intermediate sum.
    return (x & y) + ((x ^ y) >> 1);
}
__device__ static inline int __mul24(int x, int y) {
    return __ockl_mul24_i32(x, y);
}

__device__ static inline long long __mul64hi(long long int x, long long int y) {
    // Schoolbook 32x32 partial products: return the high 64 bits of the 128-bit product.
    unsigned long long x0 = (unsigned long long)x & 0xffffffffUL;
    long long x1 = x >> 32;
    unsigned long long y0 = (unsigned long long)y & 0xffffffffUL;
    long long y1 = y >> 32;
    unsigned long long z0 = x0*y0;
    long long t = x1*y0 + (z0 >> 32);
    long long z1 = t & 0xffffffffL;
    long long z2 = t >> 32;
    z1 = x0*y1 + z1;
    return x1*y1 + z2 + (z1 >> 32);
}
__device__ static inline int __mulhi(int x, int y) {
    return __ockl_mul_hi_i32(x, y);
}
__device__ static inline int __rhadd(int x, int y) {
    // Rounded halving add: floor((x + y + 1) / 2) without overflowing the intermediate sum.
    return (x | y) - ((x ^ y) >> 1);
}
__device__ static inline unsigned int __sad(int x, int y, unsigned int z) {
    return x > y ? x - y + z : y - x + z;
}
__device__ static inline unsigned int __uhadd(unsigned int x, unsigned int y) {
    // Unsigned halving add: (x + y) >> 1 without overflowing the intermediate sum.
    return (x & y) + ((x ^ y) >> 1);
}
__device__ static inline int __umul24(unsigned int x, unsigned int y) {
    return __ockl_mul24_u32(x, y);
}
__device__ static inline unsigned long long __umul64hi(unsigned long long int x, unsigned long long int y) {
    // Schoolbook 32x32 partial products: return the high 64 bits of the 128-bit product.
    unsigned long long x0 = x & 0xffffffffUL;
    unsigned long long x1 = x >> 32;
    unsigned long long y0 = y & 0xffffffffUL;
    unsigned long long y1 = y >> 32;
    unsigned long long z0 = x0*y0;
    unsigned long long t = x1*y0 + (z0 >> 32);
    unsigned long long z1 = t & 0xffffffffUL;
    unsigned long long z2 = t >> 32;
    z1 = x0*y1 + z1;
    return x1*y1 + z2 + (z1 >> 32);
}
__device__ static inline unsigned int __umulhi(unsigned int x, unsigned int y) {
    return __ockl_mul_hi_u32(x, y);
}

__device__ static inline unsigned int __urhadd(unsigned int x, unsigned int y) {
    return (x + y + 1) >> 1;
}

__device__ static inline unsigned int __usad(unsigned int x, unsigned int y, unsigned int z) {
    return __ockl_sadd_u32(x, y, z);
}
__device__ static inline unsigned int __mbcnt_lo(unsigned int x, unsigned int y) {
    return __builtin_amdgcn_mbcnt_lo(x, y);
}

__device__ static inline unsigned int __mbcnt_hi(unsigned int x, unsigned int y) {
    return __builtin_amdgcn_mbcnt_hi(x, y);
}
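// Illustrative usage sketch (not part of this header): with a full mask, the two
// mbcnt halves chain to produce the caller's lane index within a 64-wide wave.
//   unsigned lane = __mbcnt_hi(~0u, __mbcnt_lo(~0u, 0u));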
#if !defined(__HIPCC_RTC__)
#include "amd_warp_functions.h"
#include "amd_warp_sync_functions.h"
#endif
#define MASK1 0x00ff00ff
#define MASK2 0xff00ff00

__device__ static inline char4 __hip_hc_add8pk(char4 in1, char4 in2) {
    char4 out;
    unsigned one1 = in1.w & MASK1;
    unsigned one2 = in2.w & MASK1;
    out.w = (one1 + one2) & MASK1;
    one1 = in1.w & MASK2;
    one2 = in2.w & MASK2;
    out.w = out.w | ((one1 + one2) & MASK2);
    return out;
}

__device__ static inline char4 __hip_hc_sub8pk(char4 in1, char4 in2) {
    char4 out;
    unsigned one1 = in1.w & MASK1;
    unsigned one2 = in2.w & MASK1;
    out.w = (one1 - one2) & MASK1;
    one1 = in1.w & MASK2;
    one2 = in2.w & MASK2;
    out.w = out.w | ((one1 - one2) & MASK2);
    return out;
}

__device__ static inline char4 __hip_hc_mul8pk(char4 in1, char4 in2) {
    char4 out;
    unsigned one1 = in1.w & MASK1;
    unsigned one2 = in2.w & MASK1;
    out.w = (one1 * one2) & MASK1;
    one1 = in1.w & MASK2;
    one2 = in2.w & MASK2;
    out.w = out.w | ((one1 * one2) & MASK2);
    return out;
}
__device__ static inline float __double2float_rd(double x) {
    return __ocml_cvtrtn_f32_f64(x);
}
__device__ static inline float __double2float_rn(double x) { return x; }
__device__ static inline float __double2float_ru(double x) {
    return __ocml_cvtrtp_f32_f64(x);
}
__device__ static inline float __double2float_rz(double x) {
    return __ocml_cvtrtz_f32_f64(x);
}
__device__ static inline int __double2hiint(double x) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");

    int tmp[2];
    __builtin_memcpy(tmp, &x, sizeof(tmp));

    return tmp[1];
}

__device__ static inline int __double2loint(double x) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");

    int tmp[2];
    __builtin_memcpy(tmp, &x, sizeof(tmp));

    return tmp[0];
}
__device__ static inline int __double2int_rd(double x) { return (int)__ocml_floor_f64(x); }
__device__ static inline int __double2int_rn(double x) { return (int)__ocml_rint_f64(x); }
__device__ static inline int __double2int_ru(double x) { return (int)__ocml_ceil_f64(x); }
__device__ static inline int __double2int_rz(double x) { return (int)x; }
__device__ static inline long long int __double2ll_rd(double x) {
    return (long long)__ocml_floor_f64(x);
}
__device__ static inline long long int __double2ll_rn(double x) {
    return (long long)__ocml_rint_f64(x);
}
__device__ static inline long long int __double2ll_ru(double x) {
    return (long long)__ocml_ceil_f64(x);
}
__device__ static inline long long int __double2ll_rz(double x) { return (long long)x; }
__device__ static inline unsigned int __double2uint_rd(double x) {
    return (unsigned int)__ocml_floor_f64(x);
}
__device__ static inline unsigned int __double2uint_rn(double x) {
    return (unsigned int)__ocml_rint_f64(x);
}
__device__ static inline unsigned int __double2uint_ru(double x) {
    return (unsigned int)__ocml_ceil_f64(x);
}
__device__ static inline unsigned int __double2uint_rz(double x) { return (unsigned int)x; }
__device__ static inline unsigned long long int __double2ull_rd(double x) {
    return (unsigned long long int)__ocml_floor_f64(x);
}
__device__ static inline unsigned long long int __double2ull_rn(double x) {
    return (unsigned long long int)__ocml_rint_f64(x);
}
__device__ static inline unsigned long long int __double2ull_ru(double x) {
    return (unsigned long long int)__ocml_ceil_f64(x);
}
__device__ static inline unsigned long long int __double2ull_rz(double x) {
    return (unsigned long long int)x;
}
__device__ static inline long long int __double_as_longlong(double x) {
    static_assert(sizeof(long long) == sizeof(double), "");

    long long tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}
__device__ static inline int __float2int_rd(float x) { return (int)__ocml_floor_f32(x); }
__device__ static inline int __float2int_rn(float x) { return (int)__ocml_rint_f32(x); }
__device__ static inline int __float2int_ru(float x) { return (int)__ocml_ceil_f32(x); }
__device__ static inline int __float2int_rz(float x) { return (int)__ocml_trunc_f32(x); }
__device__ static inline long long int __float2ll_rd(float x) {
    return (long long int)__ocml_floor_f32(x);
}
__device__ static inline long long int __float2ll_rn(float x) {
    return (long long int)__ocml_rint_f32(x);
}
__device__ static inline long long int __float2ll_ru(float x) {
    return (long long int)__ocml_ceil_f32(x);
}
__device__ static inline long long int __float2ll_rz(float x) { return (long long int)x; }
__device__ static inline unsigned int __float2uint_rd(float x) {
    return (unsigned int)__ocml_floor_f32(x);
}
__device__ static inline unsigned int __float2uint_rn(float x) {
    return (unsigned int)__ocml_rint_f32(x);
}
__device__ static inline unsigned int __float2uint_ru(float x) {
    return (unsigned int)__ocml_ceil_f32(x);
}
__device__ static inline unsigned int __float2uint_rz(float x) { return (unsigned int)x; }
__device__ static inline unsigned long long int __float2ull_rd(float x) {
    return (unsigned long long int)__ocml_floor_f32(x);
}
__device__ static inline unsigned long long int __float2ull_rn(float x) {
    return (unsigned long long int)__ocml_rint_f32(x);
}
__device__ static inline unsigned long long int __float2ull_ru(float x) {
    return (unsigned long long int)__ocml_ceil_f32(x);
}
__device__ static inline unsigned long long int __float2ull_rz(float x) {
    return (unsigned long long int)x;
}
__device__ static inline int __float_as_int(float x) {
    static_assert(sizeof(int) == sizeof(float), "");

    int tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}
__device__ static inline unsigned int __float_as_uint(float x) {
    static_assert(sizeof(unsigned int) == sizeof(float), "");

    unsigned int tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}
__device__ static inline double __hiloint2double(int hi, int lo) {
    static_assert(sizeof(double) == sizeof(uint64_t), "");

    uint64_t tmp0 = (static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
    double tmp1;
    __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));

    return tmp1;
}
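// Illustrative usage sketch (not part of this header): __double2hiint/__double2loint
// split a double into its high and low 32-bit words, and __hiloint2double is the
// inverse, so for any double d:
//   __hiloint2double(__double2hiint(d), __double2loint(d))   // reproduces d bit for bit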
__device__ static inline double __int2double_rn(int x) { return (double)x; }

__device__ static inline float __int2float_rd(int x) {
    return __ocml_cvtrtn_f32_s32(x);
}
__device__ static inline float __int2float_rn(int x) { return (float)x; }
__device__ static inline float __int2float_ru(int x) {
    return __ocml_cvtrtp_f32_s32(x);
}
__device__ static inline float __int2float_rz(int x) {
    return __ocml_cvtrtz_f32_s32(x);
}
__device__ static inline float __int_as_float(int x) {
    static_assert(sizeof(float) == sizeof(int), "");

    float tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}
__device__ static inline double __ll2double_rd(long long int x) {
    return __ocml_cvtrtn_f64_s64(x);
}
__device__ static inline double __ll2double_rn(long long int x) { return (double)x; }
__device__ static inline double __ll2double_ru(long long int x) {
    return __ocml_cvtrtp_f64_s64(x);
}
__device__ static inline double __ll2double_rz(long long int x) {
    return __ocml_cvtrtz_f64_s64(x);
}

__device__ static inline float __ll2float_rd(long long int x) {
    return __ocml_cvtrtn_f32_s64(x);
}
__device__ static inline float __ll2float_rn(long long int x) { return (float)x; }
__device__ static inline float __ll2float_ru(long long int x) {
    return __ocml_cvtrtp_f32_s64(x);
}
__device__ static inline float __ll2float_rz(long long int x) {
    return __ocml_cvtrtz_f32_s64(x);
}
__device__ static inline double __longlong_as_double(long long int x) {
    static_assert(sizeof(double) == sizeof(long long), "");

    double tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}
__device__ static inline double __uint2double_rn(unsigned int x) { return (double)x; }

__device__ static inline float __uint2float_rd(unsigned int x) {
    return __ocml_cvtrtn_f32_u32(x);
}
__device__ static inline float __uint2float_rn(unsigned int x) { return (float)x; }
__device__ static inline float __uint2float_ru(unsigned int x) {
    return __ocml_cvtrtp_f32_u32(x);
}
__device__ static inline float __uint2float_rz(unsigned int x) {
    return __ocml_cvtrtz_f32_u32(x);
}
__device__ static inline float __uint_as_float(unsigned int x) {
    static_assert(sizeof(float) == sizeof(unsigned int), "");

    float tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}
__device__ static inline double __ull2double_rd(unsigned long long int x) {
    return __ocml_cvtrtn_f64_u64(x);
}
__device__ static inline double __ull2double_rn(unsigned long long int x) { return (double)x; }
__device__ static inline double __ull2double_ru(unsigned long long int x) {
    return __ocml_cvtrtp_f64_u64(x);
}
__device__ static inline double __ull2double_rz(unsigned long long int x) {
    return __ocml_cvtrtz_f64_u64(x);
}

__device__ static inline float __ull2float_rd(unsigned long long int x) {
    return __ocml_cvtrtn_f32_u64(x);
}
__device__ static inline float __ull2float_rn(unsigned long long int x) { return (float)x; }
__device__ static inline float __ull2float_ru(unsigned long long int x) {
    return __ocml_cvtrtp_f32_u64(x);
}
__device__ static inline float __ull2float_rz(unsigned long long int x) {
    return __ocml_cvtrtz_f32_u64(x);
}
#if defined(__clang__) && defined(__HIP__)

// Clock functions.
__device__ long long int __clock64();
__device__ long long int __clock();
__device__ long long int clock64();
__device__ long long int clock();
__device__ long long int wall_clock64();

__device__ void __named_sync();

#ifdef __HIP_DEVICE_COMPILE__

// Clock functions.
__device__
inline __attribute((always_inline))
long long int __clock64() {
#if __has_builtin(__builtin_amdgcn_s_memtime)
  return (long long int) __builtin_amdgcn_s_memtime();
#else
  return (long long int) __builtin_readcyclecounter();
#endif
}

__device__
inline __attribute((always_inline))
long long int __clock() { return __clock64(); }

// Returns the wall clock count at a constant frequency.
__device__
inline __attribute__((always_inline))
long long int wall_clock64() {
  return (long long int) __ockl_steadyctr_u64();
}

__device__
inline __attribute__((always_inline))
long long int clock64() { return __clock64(); }

__device__
inline __attribute__((always_inline))
long long int clock() { return __clock(); }
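// Illustrative usage sketch (not part of this header): __clock64() counts GPU cycles,
// while wall_clock64() ticks at the fixed rate reported by the runtime's wall-clock
// rate device attribute, making it the better choice for elapsed-time measurement.
//   long long t0 = wall_clock64();
//   /* ... device work ... */
//   long long elapsed_ticks = wall_clock64() - t0;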
__device__
inline
void __named_sync() { __builtin_amdgcn_s_barrier(); }

#endif // __HIP_DEVICE_COMPILE__
__device__ inline uint64_t __lanemask_gt()
{
    uint32_t lane = __ockl_lane_u32();
    if (lane == 63)
        return 0;
    uint64_t ballot = __ballot64(1);
    uint64_t mask = (~((uint64_t)0)) << (lane + 1);
    return mask & ballot;
}

__device__ inline uint64_t __lanemask_lt()
{
    uint32_t lane = __ockl_lane_u32();
    int64_t ballot = __ballot64(1);
    uint64_t mask = ((uint64_t)1 << lane) - (uint64_t)1;
    return mask & ballot;
}

__device__ inline uint64_t __lanemask_eq()
{
    uint32_t lane = __ockl_lane_u32();
    int64_t mask = ((uint64_t)1 << lane);
    return mask;
}
__device__ inline void* __local_to_generic(void* p) { return p; }

#ifdef __HIP_DEVICE_COMPILE__
__device__ inline
void* __get_dynamicgroupbaseptr()
{
    // Get group segment base pointer.
    return (char*)__local_to_generic((void*)__to_local(__builtin_amdgcn_groupstaticsize()));
}
#else
__device__
void* __get_dynamicgroupbaseptr();
#endif // __HIP_DEVICE_COMPILE__

__device__ inline
void *__amdgcn_get_dynamicgroupbaseptr() {
    return __get_dynamicgroupbaseptr();
}
// Memory fence functions.
__device__ inline
static void __threadfence()
{
    __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "agent");
}

__device__ inline
static void __threadfence_block()
{
    __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup");
}

__device__ inline
static void __threadfence_system()
{
    __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "");
}
__device__
inline static void __work_group_barrier(__cl_mem_fence_flags flags) {
    if (flags) {
        __builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup");
        __builtin_amdgcn_s_barrier();
        __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
    } else {
        __builtin_amdgcn_s_barrier();
    }
}

__device__
inline static void __barrier(int n) {
    __work_group_barrier((__cl_mem_fence_flags)n);
}
__device__
inline
__attribute__((convergent))
static void __syncthreads() {
    __barrier(__CLK_LOCAL_MEM_FENCE);
}

__device__
inline
__attribute__((convergent))
int __syncthreads_count(int predicate) {
    return __ockl_wgred_add_i32(!!predicate);
}

__device__
inline
__attribute__((convergent))
int __syncthreads_and(int predicate) {
    return __ockl_wgred_and_i32(!!predicate);
}

__device__
inline
__attribute__((convergent))
int __syncthreads_or(int predicate) {
    return __ockl_wgred_or_i32(!!predicate);
}
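// Illustrative usage sketch (not part of this header), assuming a per-thread 'flag'
// variable: the predicate variants pair the barrier with a workgroup-wide reduction
// over !!predicate.
//   int votes   = __syncthreads_count(threadIdx.x < 7);  // threads with x < 7
//   int all_set = __syncthreads_and(flag != 0);          // non-zero iff every thread set flag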
#if (defined (__GFX10__) || defined (__GFX11__))
  #define HW_ID 23
#else
  #define HW_ID 4
#endif

#if (defined(__GFX10__) || defined(__GFX11__))
  #define HW_ID_WGP_ID_SIZE 4
  #define HW_ID_WGP_ID_OFFSET 10
  #if (defined(__AMDGCN_CUMODE__))
    #define HW_ID_CU_ID_SIZE 1
    #define HW_ID_CU_ID_OFFSET 8
  #endif
#else
  #define HW_ID_CU_ID_SIZE 4
  #define HW_ID_CU_ID_OFFSET 8
#endif
#if (defined(__gfx908__) || defined(__gfx90a__) || \
     defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
  #define HW_ID_SE_ID_SIZE 3
#else
  #define HW_ID_SE_ID_SIZE 2
#endif
#if (defined(__GFX10__) || defined(__GFX11__))
  #define HW_ID_SE_ID_OFFSET 18
  #define HW_ID_SA_ID_OFFSET 16
  #define HW_ID_SA_ID_SIZE 1
#else
  #define HW_ID_SE_ID_OFFSET 13
#endif
#if (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
  #define XCC_ID 20
  #define XCC_ID_XCC_ID_SIZE 4
  #define XCC_ID_XCC_ID_OFFSET 0
#endif
#if (!defined(__HIP_NO_IMAGE_SUPPORT) && \
     (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)))
  #define __HIP_NO_IMAGE_SUPPORT 1
#endif

#define GETREG_IMMED(SZ,OFF,REG) (((SZ) << 11) | ((OFF) << 6) | (REG))
// Returns an identifier for the compute unit (or WGP) the calling wave is running on,
// composed from the hardware-id register fields defined above.
__device__ static inline unsigned __smid(void)
{
    unsigned se_id = __builtin_amdgcn_s_getreg(
        GETREG_IMMED(HW_ID_SE_ID_SIZE-1, HW_ID_SE_ID_OFFSET, HW_ID));
    #if (defined(__GFX10__) || defined(__GFX11__))
    unsigned wgp_id = __builtin_amdgcn_s_getreg(
        GETREG_IMMED(HW_ID_WGP_ID_SIZE - 1, HW_ID_WGP_ID_OFFSET, HW_ID));
    unsigned sa_id = __builtin_amdgcn_s_getreg(
        GETREG_IMMED(HW_ID_SA_ID_SIZE - 1, HW_ID_SA_ID_OFFSET, HW_ID));
    #if (defined(__AMDGCN_CUMODE__))
    unsigned cu_id = __builtin_amdgcn_s_getreg(
        GETREG_IMMED(HW_ID_CU_ID_SIZE - 1, HW_ID_CU_ID_OFFSET, HW_ID));
    #endif
    #else
    #if (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
    unsigned xcc_id = __builtin_amdgcn_s_getreg(
        GETREG_IMMED(XCC_ID_XCC_ID_SIZE - 1, XCC_ID_XCC_ID_OFFSET, XCC_ID));
    #endif
    unsigned cu_id = __builtin_amdgcn_s_getreg(
        GETREG_IMMED(HW_ID_CU_ID_SIZE - 1, HW_ID_CU_ID_OFFSET, HW_ID));
    #endif

    #if (defined(__GFX10__) || defined(__GFX11__))
    unsigned temp = se_id;
    temp = (temp << HW_ID_SA_ID_SIZE) | sa_id;
    temp = (temp << HW_ID_WGP_ID_SIZE) | wgp_id;
    #if (defined(__AMDGCN_CUMODE__))
    temp = (temp << HW_ID_CU_ID_SIZE) | cu_id;
    #endif
    return temp;
    #elif (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
    unsigned temp = xcc_id;
    temp = (temp << HW_ID_SE_ID_SIZE) | se_id;
    temp = (temp << HW_ID_CU_ID_SIZE) | cu_id;
    return temp;
    #else
    return (se_id << HW_ID_CU_ID_SIZE) + cu_id;
    #endif
}
#define HIP_DYNAMIC_SHARED(type, var) extern __shared__ type var[];
#define HIP_DYNAMIC_SHARED_ATTRIBUTE
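// Illustrative usage sketch (not part of this header), assuming a hypothetical kernel
// 'scale': HIP_DYNAMIC_SHARED declares the dynamically sized shared array whose byte
// size comes from the kernel launch configuration.
//   __global__ void scale(float* data) {
//       HIP_DYNAMIC_SHARED(float, smem)      // expands to: extern __shared__ float smem[];
//       smem[threadIdx.x] = data[threadIdx.x];
//   }
//   // launch: hipLaunchKernelGGL(scale, grid, block, sharedBytes, stream, data);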
static inline __device__ void* __hip_hc_memcpy(void* dst, const void* src, size_t size) {
    auto dstPtr = static_cast<unsigned char*>(dst);
    auto srcPtr = static_cast<const unsigned char*>(src);

    // Copy four bytes per iteration, then handle the 0-3 byte tail below.
    while (size >= 4u) {
        dstPtr[0] = srcPtr[0];
        dstPtr[1] = srcPtr[1];
        dstPtr[2] = srcPtr[2];
        dstPtr[3] = srcPtr[3];

        size -= 4u;
        srcPtr += 4u;
        dstPtr += 4u;
    }
    switch (size) {
        case 3:
            dstPtr[2] = srcPtr[2];
        case 2:
            dstPtr[1] = srcPtr[1];
        case 1:
            dstPtr[0] = srcPtr[0];
    }

    return dst;
}
static inline __device__ void* __hip_hc_memset(void* dst, unsigned char val, size_t size) {
    auto dstPtr = static_cast<unsigned char*>(dst);

    // Same structure as __hip_hc_memcpy: four bytes per iteration, then the tail.
    while (size >= 4u) {
        dstPtr[0] = val;
        dstPtr[1] = val;
        dstPtr[2] = val;
        dstPtr[3] = val;

        size -= 4u;
        dstPtr += 4u;
    }
    switch (size) {
        case 3:
            dstPtr[2] = val;
        case 2:
            dstPtr[1] = val;
        case 1:
            dstPtr[0] = val;
    }

    return dst;
}
#ifndef __OPENMP_AMDGCN__
static inline __device__ void* memcpy(void* dst, const void* src, size_t size) {
    return __hip_hc_memcpy(dst, src, size);
}

static inline __device__ void* memset(void* ptr, int val, size_t size) {
    unsigned char val8 = static_cast<unsigned char>(val);
    return __hip_hc_memset(ptr, val8, size);
}
#endif // !__OPENMP_AMDGCN__

#endif // defined(__clang__) && defined(__HIP__)

#endif // HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_FUNCTIONS_H