23#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_FUNCTIONS_H
24#define HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_FUNCTIONS_H
26#if !defined(__HIPCC_RTC__)
27#include <hip/amd_detail/amd_hip_common.h>
29#include <hip/amd_detail/hip_assert.h>
32#include <hip/hip_runtime_api.h>
34#include <hip/hip_vector_types.h>
37#if defined(__clang__) && defined(__HIP__)
38extern "C" __device__
int printf(
const char *fmt, ...);
40template <
typename... All>
41static inline __device__
void printf(
const char* format, All... all) {}
44extern "C" __device__
unsigned long long __ockl_steadyctr_u64();
51__device__
static inline unsigned int __popc(
unsigned int input) {
52 return __builtin_popcount(input);
54__device__
static inline unsigned int __popcll(
unsigned long long int input) {
55 return __builtin_popcountll(input);
58__device__
static inline int __clz(
int input) {
59 return __ockl_clz_u32((uint)input);
62__device__
static inline int __clzll(
long long int input) {
63 return __ockl_clz_u64((uint64_t)input);
66__device__
static inline int __ffs(
unsigned int input) {
67 return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
70__device__
static inline int __ffsll(
unsigned long long int input) {
71 return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
74__device__
static inline int __ffs(
int input) {
75 return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
78__device__
static inline int __ffsll(
long long int input) {
79 return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
82__device__
static inline unsigned int __ffsll(uint64_t input) {
83 return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
89__device__
static int32_t __fns64(uint64_t mask, uint32_t base, int32_t offset) {
90 uint64_t temp_mask = mask;
91 int32_t temp_offset = offset;
94 temp_mask &= (1 << base);
97 else if (offset < 0) {
98 temp_mask = __builtin_bitreverse64(mask);
100 temp_offset = -offset;
103 temp_mask = temp_mask & ((~0ULL) << base);
104 if (__builtin_popcountll(temp_mask) < temp_offset)
107 for (
int i = 0x20; i > 0; i >>= 1) {
108 uint64_t temp_mask_lo = temp_mask & ((1ULL << i) - 1);
109 int32_t pcnt = __builtin_popcountll(temp_mask_lo);
110 if (pcnt < temp_offset) {
111 temp_mask = temp_mask >> i;
116 temp_mask = temp_mask_lo;
125__device__
static int32_t __fns32(uint32_t mask, uint32_t base, int32_t offset) {
126 uint32_t temp_mask = mask;
127 int32_t temp_offset = offset;
129 temp_mask &= (1 << base);
132 else if (offset < 0) {
133 temp_mask = __builtin_bitreverse32(mask);
135 temp_offset = -offset;
137 temp_mask = temp_mask & ((~0U) << base);
138 if (__builtin_popcount(temp_mask) < temp_offset)
141 for (
int i = 0x10; i > 0; i >>= 1) {
142 uint32_t temp_mask_lo = temp_mask & ((1U << i) - 1);
143 int32_t pcnt = __builtin_popcount(temp_mask_lo);
144 if (pcnt < temp_offset) {
145 temp_mask = temp_mask >> i;
150 temp_mask = temp_mask_lo;
160__device__
static int32_t __fns(
unsigned int mask,
unsigned int base,
int offset) {
161 return __fns32(mask, base, offset);
164__device__
static inline unsigned int __brev(
unsigned int input) {
165 return __builtin_bitreverse32(input);
168__device__
static inline unsigned long long int __brevll(
unsigned long long int input) {
169 return __builtin_bitreverse64(input);
172__device__
static inline unsigned int __lastbit_u32_u64(uint64_t input) {
173 return input == 0 ? -1 : __builtin_ctzl(input);
176__device__
static inline unsigned int __bitextract_u32(
unsigned int src0,
unsigned int src1,
unsigned int src2) {
177 uint32_t offset = src1 & 31;
178 uint32_t width = src2 & 31;
179 return width == 0 ? 0 : (src0 << (32 - offset - width)) >> (32 - width);
182__device__
static inline uint64_t __bitextract_u64(uint64_t src0,
unsigned int src1,
unsigned int src2) {
183 uint64_t offset = src1 & 63;
184 uint64_t width = src2 & 63;
185 return width == 0 ? 0 : (src0 << (64 - offset - width)) >> (64 - width);
188__device__
static inline unsigned int __bitinsert_u32(
unsigned int src0,
unsigned int src1,
unsigned int src2,
unsigned int src3) {
189 uint32_t offset = src2 & 31;
190 uint32_t width = src3 & 31;
191 uint32_t mask = (1 << width) - 1;
192 return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
195__device__
static inline uint64_t __bitinsert_u64(uint64_t src0, uint64_t src1,
unsigned int src2,
unsigned int src3) {
196 uint64_t offset = src2 & 63;
197 uint64_t width = src3 & 63;
198 uint64_t mask = (1ULL << width) - 1;
199 return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
202__device__
inline unsigned int __funnelshift_l(
unsigned int lo,
unsigned int hi,
unsigned int shift)
204 uint32_t mask_shift = shift & 31;
205 return mask_shift == 0 ? hi : __builtin_amdgcn_alignbit(hi, lo, 32 - mask_shift);
208__device__
inline unsigned int __funnelshift_lc(
unsigned int lo,
unsigned int hi,
unsigned int shift)
210 uint32_t min_shift = shift >= 32 ? 32 : shift;
211 return min_shift == 0 ? hi : __builtin_amdgcn_alignbit(hi, lo, 32 - min_shift);
214__device__
inline unsigned int __funnelshift_r(
unsigned int lo,
unsigned int hi,
unsigned int shift)
216 return __builtin_amdgcn_alignbit(hi, lo, shift);
219__device__
inline unsigned int __funnelshift_rc(
unsigned int lo,
unsigned int hi,
unsigned int shift)
221 return shift >= 32 ? hi : __builtin_amdgcn_alignbit(hi, lo, shift);
224__device__
static unsigned int __byte_perm(
unsigned int x,
unsigned int y,
unsigned int s);
225__device__
static unsigned int __hadd(
int x,
int y);
226__device__
static int __mul24(
int x,
int y);
227__device__
static long long int __mul64hi(
long long int x,
long long int y);
228__device__
static int __mulhi(
int x,
int y);
229__device__
static int __rhadd(
int x,
int y);
230__device__
static unsigned int __sad(
int x,
int y,
unsigned int z);
231__device__
static unsigned int __uhadd(
unsigned int x,
unsigned int y);
232__device__
static int __umul24(
unsigned int x,
unsigned int y);
233__device__
static unsigned long long int __umul64hi(
unsigned long long int x,
unsigned long long int y);
234__device__
static unsigned int __umulhi(
unsigned int x,
unsigned int y);
235__device__
static unsigned int __urhadd(
unsigned int x,
unsigned int y);
236__device__
static unsigned int __usad(
unsigned int x,
unsigned int y,
unsigned int z);
253static inline unsigned int __byte_perm(
unsigned int x,
unsigned int y,
unsigned int s) {
260 result = cHoldVal.c[cHoldKey.c[0] & 0x07];
261 result += (cHoldVal.c[(cHoldKey.c[0] & 0x70) >> 4] << 8);
262 result += (cHoldVal.c[cHoldKey.c[1] & 0x07] << 16);
263 result += (cHoldVal.c[(cHoldKey.c[1] & 0x70) >> 4] << 24);
267__device__
static inline unsigned int __hadd(
int x,
int y) {
269 int sign = z & 0x8000000;
270 int value = z & 0x7FFFFFFF;
271 return ((value) >> 1 || sign);
274__device__
static inline int __mul24(
int x,
int y) {
275 return __ockl_mul24_i32(x, y);
278__device__
static inline long long __mul64hi(
long long int x,
long long int y) {
279 unsigned long long x0 = (
unsigned long long)x & 0xffffffffUL;
280 long long x1 = x >> 32;
281 unsigned long long y0 = (
unsigned long long)y & 0xffffffffUL;
282 long long y1 = y >> 32;
283 unsigned long long z0 = x0*y0;
284 long long t = x1*y0 + (z0 >> 32);
285 long long z1 = t & 0xffffffffL;
286 long long z2 = t >> 32;
288 return x1*y1 + z2 + (z1 >> 32);
291__device__
static inline int __mulhi(
int x,
int y) {
292 return __ockl_mul_hi_i32(x, y);
295__device__
static inline int __rhadd(
int x,
int y) {
297 int sign = z & 0x8000000;
298 int value = z & 0x7FFFFFFF;
299 return ((value) >> 1 || sign);
301__device__
static inline unsigned int __sad(
int x,
int y,
unsigned int z) {
302 return x > y ? x - y + z : y - x + z;
304__device__
static inline unsigned int __uhadd(
unsigned int x,
unsigned int y) {
307__device__
static inline int __umul24(
unsigned int x,
unsigned int y) {
308 return __ockl_mul24_u32(x, y);
312static inline unsigned long long __umul64hi(
unsigned long long int x,
unsigned long long int y) {
313 unsigned long long x0 = x & 0xffffffffUL;
314 unsigned long long x1 = x >> 32;
315 unsigned long long y0 = y & 0xffffffffUL;
316 unsigned long long y1 = y >> 32;
317 unsigned long long z0 = x0*y0;
318 unsigned long long t = x1*y0 + (z0 >> 32);
319 unsigned long long z1 = t & 0xffffffffUL;
320 unsigned long long z2 = t >> 32;
322 return x1*y1 + z2 + (z1 >> 32);
325__device__
static inline unsigned int __umulhi(
unsigned int x,
unsigned int y) {
326 return __ockl_mul_hi_u32(x, y);
328__device__
static inline unsigned int __urhadd(
unsigned int x,
unsigned int y) {
329 return (x + y + 1) >> 1;
331__device__
static inline unsigned int __usad(
unsigned int x,
unsigned int y,
unsigned int z) {
332 return __ockl_sadd_u32(x, y, z);
336static inline unsigned int __mbcnt_lo(
unsigned int x,
unsigned int y) {
return __builtin_amdgcn_mbcnt_lo(x,y);};
339static inline unsigned int __mbcnt_hi(
unsigned int x,
unsigned int y) {
return __builtin_amdgcn_mbcnt_hi(x,y);};
345#if !defined(__HIPCC_RTC__)
346#include "amd_warp_functions.h"
347#include "amd_warp_sync_functions.h"
350#define MASK1 0x00ff00ff
351#define MASK2 0xff00ff00
355 unsigned one1 = in1.w & MASK1;
356 unsigned one2 = in2.w & MASK1;
357 out.w = (one1 + one2) & MASK1;
358 one1 = in1.w & MASK2;
359 one2 = in2.w & MASK2;
360 out.w = out.w | ((one1 + one2) & MASK2);
366 unsigned one1 = in1.w & MASK1;
367 unsigned one2 = in2.w & MASK1;
368 out.w = (one1 - one2) & MASK1;
369 one1 = in1.w & MASK2;
370 one2 = in2.w & MASK2;
371 out.w = out.w | ((one1 - one2) & MASK2);
377 unsigned one1 = in1.w & MASK1;
378 unsigned one2 = in2.w & MASK1;
379 out.w = (one1 * one2) & MASK1;
380 one1 = in1.w & MASK2;
381 one2 = in2.w & MASK2;
382 out.w = out.w | ((one1 * one2) & MASK2);
386__device__
static inline float __double2float_rd(
double x) {
387 return __ocml_cvtrtn_f32_f64(x);
389__device__
static inline float __double2float_rn(
double x) {
return x; }
390__device__
static inline float __double2float_ru(
double x) {
391 return __ocml_cvtrtp_f32_f64(x);
393__device__
static inline float __double2float_rz(
double x) {
394 return __ocml_cvtrtz_f32_f64(x);
397__device__
static inline int __double2hiint(
double x) {
398 static_assert(
sizeof(double) == 2 *
sizeof(int),
"");
401 __builtin_memcpy(tmp, &x,
sizeof(tmp));
405__device__
static inline int __double2loint(
double x) {
406 static_assert(
sizeof(double) == 2 *
sizeof(int),
"");
409 __builtin_memcpy(tmp, &x,
sizeof(tmp));
414__device__
static inline int __double2int_rd(
double x) {
return (
int)__ocml_floor_f64(x); }
415__device__
static inline int __double2int_rn(
double x) {
return (
int)__ocml_rint_f64(x); }
416__device__
static inline int __double2int_ru(
double x) {
return (
int)__ocml_ceil_f64(x); }
417__device__
static inline int __double2int_rz(
double x) {
return (
int)x; }
419__device__
static inline long long int __double2ll_rd(
double x) {
420 return (
long long)__ocml_floor_f64(x);
422__device__
static inline long long int __double2ll_rn(
double x) {
423 return (
long long)__ocml_rint_f64(x);
425__device__
static inline long long int __double2ll_ru(
double x) {
426 return (
long long)__ocml_ceil_f64(x);
428__device__
static inline long long int __double2ll_rz(
double x) {
return (
long long)x; }
430__device__
static inline unsigned int __double2uint_rd(
double x) {
431 return (
unsigned int)__ocml_floor_f64(x);
433__device__
static inline unsigned int __double2uint_rn(
double x) {
434 return (
unsigned int)__ocml_rint_f64(x);
436__device__
static inline unsigned int __double2uint_ru(
double x) {
437 return (
unsigned int)__ocml_ceil_f64(x);
439__device__
static inline unsigned int __double2uint_rz(
double x) {
return (
unsigned int)x; }
441__device__
static inline unsigned long long int __double2ull_rd(
double x) {
442 return (
unsigned long long int)__ocml_floor_f64(x);
444__device__
static inline unsigned long long int __double2ull_rn(
double x) {
445 return (
unsigned long long int)__ocml_rint_f64(x);
447__device__
static inline unsigned long long int __double2ull_ru(
double x) {
448 return (
unsigned long long int)__ocml_ceil_f64(x);
450__device__
static inline unsigned long long int __double2ull_rz(
double x) {
451 return (
unsigned long long int)x;
453__device__
static inline long long int __double_as_longlong(
double x) {
454 static_assert(
sizeof(
long long) ==
sizeof(double),
"");
457 __builtin_memcpy(&tmp, &x,
sizeof(tmp));
476__device__
static inline int __float2int_rd(
float x) {
return (
int)__ocml_floor_f32(x); }
477__device__
static inline int __float2int_rn(
float x) {
return (
int)__ocml_rint_f32(x); }
478__device__
static inline int __float2int_ru(
float x) {
return (
int)__ocml_ceil_f32(x); }
479__device__
static inline int __float2int_rz(
float x) {
return (
int)__ocml_trunc_f32(x); }
481__device__
static inline long long int __float2ll_rd(
float x) {
482 return (
long long int)__ocml_floor_f32(x);
484__device__
static inline long long int __float2ll_rn(
float x) {
485 return (
long long int)__ocml_rint_f32(x);
487__device__
static inline long long int __float2ll_ru(
float x) {
488 return (
long long int)__ocml_ceil_f32(x);
490__device__
static inline long long int __float2ll_rz(
float x) {
return (
long long int)x; }
492__device__
static inline unsigned int __float2uint_rd(
float x) {
493 return (
unsigned int)__ocml_floor_f32(x);
495__device__
static inline unsigned int __float2uint_rn(
float x) {
496 return (
unsigned int)__ocml_rint_f32(x);
498__device__
static inline unsigned int __float2uint_ru(
float x) {
499 return (
unsigned int)__ocml_ceil_f32(x);
501__device__
static inline unsigned int __float2uint_rz(
float x) {
return (
unsigned int)x; }
503__device__
static inline unsigned long long int __float2ull_rd(
float x) {
504 return (
unsigned long long int)__ocml_floor_f32(x);
506__device__
static inline unsigned long long int __float2ull_rn(
float x) {
507 return (
unsigned long long int)__ocml_rint_f32(x);
509__device__
static inline unsigned long long int __float2ull_ru(
float x) {
510 return (
unsigned long long int)__ocml_ceil_f32(x);
512__device__
static inline unsigned long long int __float2ull_rz(
float x) {
513 return (
unsigned long long int)x;
516__device__
static inline int __float_as_int(
float x) {
517 static_assert(
sizeof(int) ==
sizeof(float),
"");
520 __builtin_memcpy(&tmp, &x,
sizeof(tmp));
525__device__
static inline unsigned int __float_as_uint(
float x) {
526 static_assert(
sizeof(
unsigned int) ==
sizeof(float),
"");
529 __builtin_memcpy(&tmp, &x,
sizeof(tmp));
534__device__
static inline double __hiloint2double(
int hi,
int lo) {
535 static_assert(
sizeof(double) ==
sizeof(uint64_t),
"");
537 uint64_t tmp0 = (
static_cast<uint64_t
>(hi) << 32ull) |
static_cast<uint32_t
>(lo);
539 __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
544__device__
static inline double __int2double_rn(
int x) {
return (
double)x; }
546__device__
static inline float __int2float_rd(
int x) {
547 return __ocml_cvtrtn_f32_s32(x);
549__device__
static inline float __int2float_rn(
int x) {
return (
float)x; }
550__device__
static inline float __int2float_ru(
int x) {
551 return __ocml_cvtrtp_f32_s32(x);
553__device__
static inline float __int2float_rz(
int x) {
554 return __ocml_cvtrtz_f32_s32(x);
557__device__
static inline float __int_as_float(
int x) {
558 static_assert(
sizeof(float) ==
sizeof(int),
"");
561 __builtin_memcpy(&tmp, &x,
sizeof(tmp));
566__device__
static inline double __ll2double_rd(
long long int x) {
567 return __ocml_cvtrtn_f64_s64(x);
569__device__
static inline double __ll2double_rn(
long long int x) {
return (
double)x; }
570__device__
static inline double __ll2double_ru(
long long int x) {
571 return __ocml_cvtrtp_f64_s64(x);
573__device__
static inline double __ll2double_rz(
long long int x) {
574 return __ocml_cvtrtz_f64_s64(x);
577__device__
static inline float __ll2float_rd(
long long int x) {
578 return __ocml_cvtrtn_f32_s64(x);
580__device__
static inline float __ll2float_rn(
long long int x) {
return (
float)x; }
581__device__
static inline float __ll2float_ru(
long long int x) {
582 return __ocml_cvtrtp_f32_s64(x);
584__device__
static inline float __ll2float_rz(
long long int x) {
585 return __ocml_cvtrtz_f32_s64(x);
588__device__
static inline double __longlong_as_double(
long long int x) {
589 static_assert(
sizeof(double) ==
sizeof(
long long),
"");
592 __builtin_memcpy(&tmp, &x,
sizeof(tmp));
597__device__
static inline double __uint2double_rn(
unsigned int x) {
return (
double)x; }
599__device__
static inline float __uint2float_rd(
unsigned int x) {
600 return __ocml_cvtrtn_f32_u32(x);
602__device__
static inline float __uint2float_rn(
unsigned int x) {
return (
float)x; }
603__device__
static inline float __uint2float_ru(
unsigned int x) {
604 return __ocml_cvtrtp_f32_u32(x);
606__device__
static inline float __uint2float_rz(
unsigned int x) {
607 return __ocml_cvtrtz_f32_u32(x);
610__device__
static inline float __uint_as_float(
unsigned int x) {
611 static_assert(
sizeof(float) ==
sizeof(
unsigned int),
"");
614 __builtin_memcpy(&tmp, &x,
sizeof(tmp));
619__device__
static inline double __ull2double_rd(
unsigned long long int x) {
620 return __ocml_cvtrtn_f64_u64(x);
622__device__
static inline double __ull2double_rn(
unsigned long long int x) {
return (
double)x; }
623__device__
static inline double __ull2double_ru(
unsigned long long int x) {
624 return __ocml_cvtrtp_f64_u64(x);
626__device__
static inline double __ull2double_rz(
unsigned long long int x) {
627 return __ocml_cvtrtz_f64_u64(x);
630__device__
static inline float __ull2float_rd(
unsigned long long int x) {
631 return __ocml_cvtrtn_f32_u64(x);
633__device__
static inline float __ull2float_rn(
unsigned long long int x) {
return (
float)x; }
634__device__
static inline float __ull2float_ru(
unsigned long long int x) {
635 return __ocml_cvtrtp_f32_u64(x);
637__device__
static inline float __ull2float_rz(
unsigned long long int x) {
638 return __ocml_cvtrtz_f32_u64(x);
641#if defined(__clang__) && defined(__HIP__)
644__device__
long long int __clock64();
645__device__
long long int __clock();
646__device__
long long int clock64();
647__device__
long long int clock();
648__device__
long long int wall_clock64();
650__device__
void __named_sync();
652#ifdef __HIP_DEVICE_COMPILE__
658inline __attribute((always_inline))
659long long int __clock64() {
660#if __has_builtin(__builtin_amdgcn_s_memtime)
662 return (
long long int) __builtin_amdgcn_s_memtime();
665 return (
long long int) __builtin_readcyclecounter();
670inline __attribute((always_inline))
671long long int __clock() {
return __clock64(); }
677long long int wall_clock64() {
678 return (
long long int) __ockl_steadyctr_u64();
683long long int clock64() {
return __clock64(); }
687long long int clock() {
return __clock(); }
692void __named_sync() { __builtin_amdgcn_s_barrier(); }
699uint64_t __lanemask_gt()
701 uint32_t lane = __ockl_lane_u32();
704 uint64_t ballot = __ballot64(1);
705 uint64_t mask = (~((uint64_t)0)) << (lane + 1);
706 return mask & ballot;
711uint64_t __lanemask_lt()
713 uint32_t lane = __ockl_lane_u32();
714 int64_t ballot = __ballot64(1);
715 uint64_t mask = ((uint64_t)1 << lane) - (uint64_t)1;
716 return mask & ballot;
721uint64_t __lanemask_eq()
723 uint32_t lane = __ockl_lane_u32();
724 int64_t mask = ((uint64_t)1 << lane);
729__device__
inline void* __local_to_generic(
void* p) {
return p; }
731#ifdef __HIP_DEVICE_COMPILE__
734void* __get_dynamicgroupbaseptr()
737 return (
char*)__local_to_generic((
void*)__to_local(__builtin_amdgcn_groupstaticsize()));
741void* __get_dynamicgroupbaseptr();
746void *__amdgcn_get_dynamicgroupbaseptr() {
747 return __get_dynamicgroupbaseptr();
753static void __threadfence()
755 __builtin_amdgcn_fence(__ATOMIC_SEQ_CST,
"agent");
760static void __threadfence_block()
762 __builtin_amdgcn_fence(__ATOMIC_SEQ_CST,
"workgroup");
767static void __threadfence_system()
769 __builtin_amdgcn_fence(__ATOMIC_SEQ_CST,
"");
771__device__
inline static void __work_group_barrier(__cl_mem_fence_flags flags) {
773 __builtin_amdgcn_fence(__ATOMIC_RELEASE,
"workgroup");
774 __builtin_amdgcn_s_barrier();
775 __builtin_amdgcn_fence(__ATOMIC_ACQUIRE,
"workgroup");
777 __builtin_amdgcn_s_barrier();
783static void __barrier(
int n)
785 __work_group_barrier((__cl_mem_fence_flags)n);
793 __barrier(__CLK_LOCAL_MEM_FENCE);
799int __syncthreads_count(
int predicate)
801 return __ockl_wgred_add_i32(!!predicate);
807int __syncthreads_and(
int predicate)
809 return __ockl_wgred_and_i32(!!predicate);
815int __syncthreads_or(
int predicate)
817 return __ockl_wgred_or_i32(!!predicate);
849#if (defined (__GFX10__) || defined (__GFX11__))
855#if (defined(__GFX10__) || defined(__GFX11__))
856 #define HW_ID_WGP_ID_SIZE 4
857 #define HW_ID_WGP_ID_OFFSET 10
858 #if (defined(__AMDGCN_CUMODE__))
859 #define HW_ID_CU_ID_SIZE 1
860 #define HW_ID_CU_ID_OFFSET 8
863 #define HW_ID_CU_ID_SIZE 4
864 #define HW_ID_CU_ID_OFFSET 8
867#if (defined(__gfx908__) || defined(__gfx90a__) || \
869 #define HW_ID_SE_ID_SIZE 3
871 #define HW_ID_SE_ID_SIZE 2
873#if (defined(__GFX10__) || defined(__GFX11__))
874 #define HW_ID_SE_ID_OFFSET 18
875 #define HW_ID_SA_ID_OFFSET 16
876 #define HW_ID_SA_ID_SIZE 1
878 #define HW_ID_SE_ID_OFFSET 13
881#if (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
883 #define XCC_ID_XCC_ID_SIZE 4
884 #define XCC_ID_XCC_ID_OFFSET 0
887#if (!defined(__HIP_NO_IMAGE_SUPPORT) && \
888 (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)))
889 #define __HIP_NO_IMAGE_SUPPORT 1
899#define GETREG_IMMED(SZ,OFF,REG) (((SZ) << 11) | ((OFF) << 6) | (REG))
911 unsigned se_id = __builtin_amdgcn_s_getreg(
912 GETREG_IMMED(HW_ID_SE_ID_SIZE-1, HW_ID_SE_ID_OFFSET, HW_ID));
913 #if (defined(__GFX10__) || defined(__GFX11__))
914 unsigned wgp_id = __builtin_amdgcn_s_getreg(
915 GETREG_IMMED(HW_ID_WGP_ID_SIZE - 1, HW_ID_WGP_ID_OFFSET, HW_ID));
916 unsigned sa_id = __builtin_amdgcn_s_getreg(
917 GETREG_IMMED(HW_ID_SA_ID_SIZE - 1, HW_ID_SA_ID_OFFSET, HW_ID));
918 #if (defined(__AMDGCN_CUMODE__))
919 unsigned cu_id = __builtin_amdgcn_s_getreg(
920 GETREG_IMMED(HW_ID_CU_ID_SIZE - 1, HW_ID_CU_ID_OFFSET, HW_ID));
923 #if (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
924 unsigned xcc_id = __builtin_amdgcn_s_getreg(
925 GETREG_IMMED(XCC_ID_XCC_ID_SIZE - 1, XCC_ID_XCC_ID_OFFSET, XCC_ID));
927 unsigned cu_id = __builtin_amdgcn_s_getreg(
928 GETREG_IMMED(HW_ID_CU_ID_SIZE - 1, HW_ID_CU_ID_OFFSET, HW_ID));
930 #if (defined(__GFX10__) || defined(__GFX11__))
931 unsigned temp = se_id;
932 temp = (temp << HW_ID_SA_ID_SIZE) | sa_id;
933 temp = (temp << HW_ID_WGP_ID_SIZE) | wgp_id;
934 #if (defined(__AMDGCN_CUMODE__))
935 temp = (temp << HW_ID_CU_ID_SIZE) | cu_id;
939 #elif (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
940 unsigned temp = xcc_id;
941 temp = (temp << HW_ID_SE_ID_SIZE) | se_id;
942 temp = (temp << HW_ID_CU_ID_SIZE) | cu_id;
945 return (se_id << HW_ID_CU_ID_SIZE) + cu_id;
953#define HIP_DYNAMIC_SHARED(type, var) extern __shared__ type var[];
954#define HIP_DYNAMIC_SHARED_ATTRIBUTE
960static inline __device__
void* __hip_hc_memcpy(
void* dst,
const void* src,
size_t size) {
961 auto dstPtr =
static_cast<unsigned char*
>(dst);
962 auto srcPtr =
static_cast<const unsigned char*
>(src);
965 dstPtr[0] = srcPtr[0];
966 dstPtr[1] = srcPtr[1];
967 dstPtr[2] = srcPtr[2];
968 dstPtr[3] = srcPtr[3];
976 dstPtr[2] = srcPtr[2];
978 dstPtr[1] = srcPtr[1];
980 dstPtr[0] = srcPtr[0];
986static inline __device__
void* __hip_hc_memset(
void* dst,
unsigned char val,
size_t size) {
987 auto dstPtr =
static_cast<unsigned char*
>(dst);
1009#ifndef __OPENMP_AMDGCN__
1010static inline __device__
void* memcpy(
void* dst,
const void* src,
size_t size) {
1011 return __hip_hc_memcpy(dst, src, size);
1014static inline __device__
void* memset(
void* ptr,
int val,
size_t size) {
1015 unsigned char val8 =
static_cast<unsigned char>(val);
1016 return __hip_hc_memset(ptr, val8, size);
Contains declarations for types and functions in device library. Uses int64_t and uint64_t instead of...
_Float16 __2f16 __attribute__((ext_vector_type(2)))
Definition hip_fp16_math_fwd.h:57
Definition amd_device_functions.h:238
Definition amd_device_functions.h:245
Definition amd_hip_vector_types.h:1672