HIP: Heterogeneous-computing Interface for Portability
amd_device_functions.h
/*
Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_FUNCTIONS_H
#define HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_FUNCTIONS_H

#if !defined(__HIPCC_RTC__)
#include <hip/amd_detail/amd_hip_common.h>
#include <hip/amd_detail/hip_assert.h>
#include "host_defines.h"
#include "math_fwd.h"
#include <hip/hip_runtime_api.h>
#include <stddef.h>
#include <hip/hip_vector_types.h>
#endif // !defined(__HIPCC_RTC__)

#if defined(__clang__) && defined(__HIP__)
extern "C" __device__ int printf(const char *fmt, ...);
#else
template <typename... All>
static inline __device__ void printf(const char* format, All... all) {}
#endif

extern "C" __device__ unsigned long long __ockl_steadyctr_u64();

/*
Integer Intrinsics
*/

// Integer intrinsic functions: __popc, __clz, __ffs, __brev
__device__ static inline unsigned int __popc(unsigned int input) {
    return __builtin_popcount(input);
}
__device__ static inline unsigned int __popcll(unsigned long long int input) {
    return __builtin_popcountll(input);
}

__device__ static inline int __clz(int input) {
    return __ockl_clz_u32((uint)input);
}

__device__ static inline int __clzll(long long int input) {
    return __ockl_clz_u64((uint64_t)input);
}

__device__ static inline int __ffs(unsigned int input) {
    return (input == 0 ? -1 : __builtin_ctz(input)) + 1;
}

__device__ static inline int __ffsll(unsigned long long int input) {
    return (input == 0 ? -1 : __builtin_ctzll(input)) + 1;
}

__device__ static inline int __ffs(int input) {
    return (input == 0 ? -1 : __builtin_ctz(input)) + 1;
}

__device__ static inline int __ffsll(long long int input) {
    return (input == 0 ? -1 : __builtin_ctzll(input)) + 1;
}

__device__ static inline unsigned int __ffsll(uint64_t input) {
    return (input == 0 ? -1 : __builtin_ctzll(input)) + 1;
}

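// Usage sketch (illustrative; the kernel and parameter names are hypothetical):
// per-word bit statistics built from __popc and __ffs. __ffs returns the
// 1-based index of the lowest set bit, or 0 when no bit is set.
//
//   __global__ void bit_stats(const unsigned int* in, int* pop, int* first,
//                             size_t n) {
//     size_t i = blockIdx.x * (size_t)blockDim.x + threadIdx.x;
//     if (i < n) {
//       pop[i]   = __popc(in[i]);   // number of set bits in the word
//       first[i] = __ffs(in[i]);    // lowest set bit, 1-based; 0 if in[i] == 0
//     }
//   }
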
// Given a 32/64-bit exec mask and a base bit position (between 0 and
// WAVEFRONT_SIZE - 1), find the n-th (given by offset) set bit in the mask
// starting from the base bit, and return its bit position.
// If no such bit is found, return -1.
__device__ static int32_t __fns64(uint64_t mask, uint32_t base, int32_t offset) {
    uint64_t temp_mask = mask;
    int32_t temp_offset = offset;

    if (offset == 0) {
        temp_mask &= (1ULL << base);
        temp_offset = 1;
    }
    else if (offset < 0) {
        temp_mask = __builtin_bitreverse64(mask);
        base = 63 - base;
        temp_offset = -offset;
    }

    temp_mask = temp_mask & ((~0ULL) << base);
    if (__builtin_popcountll(temp_mask) < temp_offset)
        return -1;
    int32_t total = 0;
    for (int i = 0x20; i > 0; i >>= 1) {
        uint64_t temp_mask_lo = temp_mask & ((1ULL << i) - 1);
        int32_t pcnt = __builtin_popcountll(temp_mask_lo);
        if (pcnt < temp_offset) {
            temp_mask = temp_mask >> i;
            temp_offset -= pcnt;
            total += i;
        }
        else {
            temp_mask = temp_mask_lo;
        }
    }
    if (offset < 0)
        return 63 - total;
    else
        return total;
}

__device__ static int32_t __fns32(uint32_t mask, uint32_t base, int32_t offset) {
    uint32_t temp_mask = mask;
    int32_t temp_offset = offset;
    if (offset == 0) {
        temp_mask &= (1U << base);
        temp_offset = 1;
    }
    else if (offset < 0) {
        temp_mask = __builtin_bitreverse32(mask);
        base = 31 - base;
        temp_offset = -offset;
    }
    temp_mask = temp_mask & ((~0U) << base);
    if (__builtin_popcount(temp_mask) < temp_offset)
        return -1;
    int32_t total = 0;
    for (int i = 0x10; i > 0; i >>= 1) {
        uint32_t temp_mask_lo = temp_mask & ((1U << i) - 1);
        int32_t pcnt = __builtin_popcount(temp_mask_lo);
        if (pcnt < temp_offset) {
            temp_mask = temp_mask >> i;
            temp_offset -= pcnt;
            total += i;
        }
        else {
            temp_mask = temp_mask_lo;
        }
    }
    if (offset < 0)
        return 31 - total;
    else
        return total;
}

// Wrapper around __fns32() to make porting from CUDA easier
__device__ static int32_t __fns(unsigned int mask, unsigned int base, int offset) {
    return __fns32(mask, base, offset);
}

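// Usage sketch (illustrative; the helper name is hypothetical): picking the
// k-th active lane at or above the current one within a wave64 wavefront,
// e.g. to choose a partner lane for a shuffle.
//
//   __device__ int32_t kth_active_lane_above(uint64_t active_mask, int32_t k) {
//     uint32_t lane = __ockl_lane_u32();    // this thread's lane id
//     return __fns64(active_mask, lane, k); // -1 if fewer than k set bits remain
//   }
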
__device__ static inline unsigned int __brev(unsigned int input) {
    return __builtin_bitreverse32(input);
}

__device__ static inline unsigned long long int __brevll(unsigned long long int input) {
    return __builtin_bitreverse64(input);
}

__device__ static inline unsigned int __lastbit_u32_u64(uint64_t input) {
    return input == 0 ? -1 : __builtin_ctzll(input);
}

__device__ static inline unsigned int __bitextract_u32(unsigned int src0, unsigned int src1, unsigned int src2) {
    uint32_t offset = src1 & 31;
    uint32_t width = src2 & 31;
    return width == 0 ? 0 : (src0 << (32 - offset - width)) >> (32 - width);
}

__device__ static inline uint64_t __bitextract_u64(uint64_t src0, unsigned int src1, unsigned int src2) {
    uint64_t offset = src1 & 63;
    uint64_t width = src2 & 63;
    return width == 0 ? 0 : (src0 << (64 - offset - width)) >> (64 - width);
}

__device__ static inline unsigned int __bitinsert_u32(unsigned int src0, unsigned int src1, unsigned int src2, unsigned int src3) {
    uint32_t offset = src2 & 31;
    uint32_t width = src3 & 31;
    uint32_t mask = (1U << width) - 1;
    return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
}

__device__ static inline uint64_t __bitinsert_u64(uint64_t src0, uint64_t src1, unsigned int src2, unsigned int src3) {
    uint64_t offset = src2 & 63;
    uint64_t width = src3 & 63;
    uint64_t mask = (1ULL << width) - 1;
    return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
}

__device__ inline unsigned int __funnelshift_l(unsigned int lo, unsigned int hi, unsigned int shift)
{
    uint32_t mask_shift = shift & 31;
    return mask_shift == 0 ? hi : __builtin_amdgcn_alignbit(hi, lo, 32 - mask_shift);
}

__device__ inline unsigned int __funnelshift_lc(unsigned int lo, unsigned int hi, unsigned int shift)
{
    uint32_t min_shift = shift >= 32 ? 32 : shift;
    return min_shift == 0 ? hi : __builtin_amdgcn_alignbit(hi, lo, 32 - min_shift);
}

__device__ inline unsigned int __funnelshift_r(unsigned int lo, unsigned int hi, unsigned int shift)
{
    return __builtin_amdgcn_alignbit(hi, lo, shift);
}

__device__ inline unsigned int __funnelshift_rc(unsigned int lo, unsigned int hi, unsigned int shift)
{
    return shift >= 32 ? hi : __builtin_amdgcn_alignbit(hi, lo, shift);
}

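// Usage sketch (illustrative; the helper name is hypothetical): a funnel
// shift extracts a 32-bit window from the 64-bit value (hi:lo), the usual
// primitive for unaligned bit-stream readers.
//
//   __device__ unsigned int read_window(unsigned int lo, unsigned int hi,
//                                       unsigned int bit_offset /* 0..31 */) {
//     return __funnelshift_r(lo, hi, bit_offset);  // low 32 bits of (hi:lo) >> bit_offset
//   }
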
__device__ static unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s);
__device__ static unsigned int __hadd(int x, int y);
__device__ static int __mul24(int x, int y);
__device__ static long long int __mul64hi(long long int x, long long int y);
__device__ static int __mulhi(int x, int y);
__device__ static int __rhadd(int x, int y);
__device__ static unsigned int __sad(int x, int y, unsigned int z);
__device__ static unsigned int __uhadd(unsigned int x, unsigned int y);
__device__ static int __umul24(unsigned int x, unsigned int y);
__device__ static unsigned long long int __umul64hi(unsigned long long int x, unsigned long long int y);
__device__ static unsigned int __umulhi(unsigned int x, unsigned int y);
__device__ static unsigned int __urhadd(unsigned int x, unsigned int y);
__device__ static unsigned int __usad(unsigned int x, unsigned int y, unsigned int z);

struct ucharHolder {
    union {
        unsigned char c[4];
        unsigned int ui;
    };
} __attribute__((aligned(4)));

struct uchar2Holder {
    union {
        unsigned int ui[2];
        unsigned char c[8];
    };
} __attribute__((aligned(8)));

__device__
static inline unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s) {
    struct uchar2Holder cHoldVal;
    struct ucharHolder cHoldKey;
    cHoldKey.ui = s;
    cHoldVal.ui[0] = x;
    cHoldVal.ui[1] = y;
    unsigned int result;
    result = cHoldVal.c[cHoldKey.c[0] & 0x07];
    result += (cHoldVal.c[(cHoldKey.c[0] & 0x70) >> 4] << 8);
    result += (cHoldVal.c[cHoldKey.c[1] & 0x07] << 16);
    result += (cHoldVal.c[(cHoldKey.c[1] & 0x70) >> 4] << 24);
    return result;
}

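// Usage sketch (illustrative; the helper name is hypothetical): with selector
// 0x0123, each result byte n is taken from source byte 3-n of x, i.e. the
// familiar CUDA idiom for a 32-bit byte swap.
//
//   __device__ unsigned int bswap32(unsigned int x) {
//     return __byte_perm(x, 0, 0x0123);   // 0xAABBCCDD -> 0xDDCCBBAA
//   }
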
__device__ static inline unsigned int __hadd(int x, int y) {
    // Halving add: (x + y) >> 1, computed without losing the carry bit
    // that a plain 32-bit x + y could overflow away.
    return (x & y) + ((x ^ y) >> 1);
}

__device__ static inline int __mul24(int x, int y) {
    return __ockl_mul24_i32(x, y);
}

__device__ static inline long long __mul64hi(long long int x, long long int y) {
    // Schoolbook 64x64 -> 128-bit multiply on 32-bit halves; returns the
    // high 64 bits of the product.
    unsigned long long x0 = (unsigned long long)x & 0xffffffffUL;
    long long x1 = x >> 32;
    unsigned long long y0 = (unsigned long long)y & 0xffffffffUL;
    long long y1 = y >> 32;
    unsigned long long z0 = x0 * y0;
    long long t = x1 * y0 + (z0 >> 32);
    long long z1 = t & 0xffffffffL;
    long long z2 = t >> 32;
    z1 = x0 * y1 + z1;
    return x1 * y1 + z2 + (z1 >> 32);
}

__device__ static inline int __mulhi(int x, int y) {
    return __ockl_mul_hi_i32(x, y);
}

__device__ static inline int __rhadd(int x, int y) {
    // Rounded halving add: (x + y + 1) >> 1, again without the intermediate
    // overflow of a plain 32-bit sum.
    return (x | y) - ((x ^ y) >> 1);
}
__device__ static inline unsigned int __sad(int x, int y, unsigned int z) {
    return x > y ? x - y + z : y - x + z;
}
__device__ static inline unsigned int __uhadd(unsigned int x, unsigned int y) {
    // Unsigned halving add, safe against wrap-around of x + y.
    return (x & y) + ((x ^ y) >> 1);
}
__device__ static inline int __umul24(unsigned int x, unsigned int y) {
    return __ockl_mul24_u32(x, y);
}

__device__
static inline unsigned long long __umul64hi(unsigned long long int x, unsigned long long int y) {
    // Unsigned variant of the schoolbook multiply above; returns the high
    // 64 bits of the 128-bit product.
    unsigned long long x0 = x & 0xffffffffUL;
    unsigned long long x1 = x >> 32;
    unsigned long long y0 = y & 0xffffffffUL;
    unsigned long long y1 = y >> 32;
    unsigned long long z0 = x0 * y0;
    unsigned long long t = x1 * y0 + (z0 >> 32);
    unsigned long long z1 = t & 0xffffffffUL;
    unsigned long long z2 = t >> 32;
    z1 = x0 * y1 + z1;
    return x1 * y1 + z2 + (z1 >> 32);
}

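// Usage sketch (illustrative; the helper name is hypothetical): the high word
// of a 64x64-bit product implements fixed-point scaling, e.g. mapping a
// 64-bit hash uniformly into [0, n) without a modulo.
//
//   __device__ unsigned long long scale_to_range(unsigned long long hash,
//                                                unsigned long long n) {
//     return __umul64hi(hash, n);   // floor(hash * n / 2^64), always < n
//   }
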
__device__ static inline unsigned int __umulhi(unsigned int x, unsigned int y) {
    return __ockl_mul_hi_u32(x, y);
}
__device__ static inline unsigned int __urhadd(unsigned int x, unsigned int y) {
    // Unsigned rounded halving add, safe against wrap-around of x + y + 1.
    return (x | y) - ((x ^ y) >> 1);
}
__device__ static inline unsigned int __usad(unsigned int x, unsigned int y, unsigned int z) {
    return __ockl_sadd_u32(x, y, z);
}

__device__
static inline unsigned int __mbcnt_lo(unsigned int x, unsigned int y) { return __builtin_amdgcn_mbcnt_lo(x, y); }

__device__
static inline unsigned int __mbcnt_hi(unsigned int x, unsigned int y) { return __builtin_amdgcn_mbcnt_hi(x, y); }

/*
HIP specific device functions
*/

#if !defined(__HIPCC_RTC__)
#include "amd_warp_functions.h"
#include "amd_warp_sync_functions.h"
#endif

#define MASK1 0x00ff00ff
#define MASK2 0xff00ff00

__device__ static inline char4 __hip_hc_add8pk(char4 in1, char4 in2) {
    char4 out;
    unsigned one1 = in1.w & MASK1;
    unsigned one2 = in2.w & MASK1;
    out.w = (one1 + one2) & MASK1;
    one1 = in1.w & MASK2;
    one2 = in2.w & MASK2;
    out.w = out.w | ((one1 + one2) & MASK2);
    return out;
}

__device__ static inline char4 __hip_hc_sub8pk(char4 in1, char4 in2) {
    char4 out;
    unsigned one1 = in1.w & MASK1;
    unsigned one2 = in2.w & MASK1;
    out.w = (one1 - one2) & MASK1;
    one1 = in1.w & MASK2;
    one2 = in2.w & MASK2;
    out.w = out.w | ((one1 - one2) & MASK2);
    return out;
}

__device__ static inline char4 __hip_hc_mul8pk(char4 in1, char4 in2) {
    char4 out;
    unsigned one1 = in1.w & MASK1;
    unsigned one2 = in2.w & MASK1;
    out.w = (one1 * one2) & MASK1;
    one1 = in1.w & MASK2;
    one2 = in2.w & MASK2;
    out.w = out.w | ((one1 * one2) & MASK2);
    return out;
}

__device__ static inline float __double2float_rd(double x) {
    return __ocml_cvtrtn_f32_f64(x);
}
__device__ static inline float __double2float_rn(double x) { return (float)x; }
__device__ static inline float __double2float_ru(double x) {
    return __ocml_cvtrtp_f32_f64(x);
}
__device__ static inline float __double2float_rz(double x) {
    return __ocml_cvtrtz_f32_f64(x);
}

__device__ static inline int __double2hiint(double x) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");

    int tmp[2];
    __builtin_memcpy(tmp, &x, sizeof(tmp));

    return tmp[1];
}
__device__ static inline int __double2loint(double x) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");

    int tmp[2];
    __builtin_memcpy(tmp, &x, sizeof(tmp));

    return tmp[0];
}

__device__ static inline int __double2int_rd(double x) { return (int)__ocml_floor_f64(x); }
__device__ static inline int __double2int_rn(double x) { return (int)__ocml_rint_f64(x); }
__device__ static inline int __double2int_ru(double x) { return (int)__ocml_ceil_f64(x); }
__device__ static inline int __double2int_rz(double x) { return (int)x; }

__device__ static inline long long int __double2ll_rd(double x) {
    return (long long)__ocml_floor_f64(x);
}
__device__ static inline long long int __double2ll_rn(double x) {
    return (long long)__ocml_rint_f64(x);
}
__device__ static inline long long int __double2ll_ru(double x) {
    return (long long)__ocml_ceil_f64(x);
}
__device__ static inline long long int __double2ll_rz(double x) { return (long long)x; }

__device__ static inline unsigned int __double2uint_rd(double x) {
    return (unsigned int)__ocml_floor_f64(x);
}
__device__ static inline unsigned int __double2uint_rn(double x) {
    return (unsigned int)__ocml_rint_f64(x);
}
__device__ static inline unsigned int __double2uint_ru(double x) {
    return (unsigned int)__ocml_ceil_f64(x);
}
__device__ static inline unsigned int __double2uint_rz(double x) { return (unsigned int)x; }

__device__ static inline unsigned long long int __double2ull_rd(double x) {
    return (unsigned long long int)__ocml_floor_f64(x);
}
__device__ static inline unsigned long long int __double2ull_rn(double x) {
    return (unsigned long long int)__ocml_rint_f64(x);
}
__device__ static inline unsigned long long int __double2ull_ru(double x) {
    return (unsigned long long int)__ocml_ceil_f64(x);
}
__device__ static inline unsigned long long int __double2ull_rz(double x) {
    return (unsigned long long int)x;
}
__device__ static inline long long int __double_as_longlong(double x) {
    static_assert(sizeof(long long) == sizeof(double), "");

    long long tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}

/*
__device__ unsigned short __float2half_rn(float x);
__device__ float __half2float(unsigned short);

The above device functions are not valid here.
Use
__device__ __half __float2half_rn(float x);
__device__ float __half2float(__half);
from hip_fp16.h instead.

CUDA implements half as unsigned short, whereas HIP does not.
*/

__device__ static inline int __float2int_rd(float x) { return (int)__ocml_floor_f32(x); }
__device__ static inline int __float2int_rn(float x) { return (int)__ocml_rint_f32(x); }
__device__ static inline int __float2int_ru(float x) { return (int)__ocml_ceil_f32(x); }
__device__ static inline int __float2int_rz(float x) { return (int)__ocml_trunc_f32(x); }

__device__ static inline long long int __float2ll_rd(float x) {
    return (long long int)__ocml_floor_f32(x);
}
__device__ static inline long long int __float2ll_rn(float x) {
    return (long long int)__ocml_rint_f32(x);
}
__device__ static inline long long int __float2ll_ru(float x) {
    return (long long int)__ocml_ceil_f32(x);
}
__device__ static inline long long int __float2ll_rz(float x) { return (long long int)x; }

__device__ static inline unsigned int __float2uint_rd(float x) {
    return (unsigned int)__ocml_floor_f32(x);
}
__device__ static inline unsigned int __float2uint_rn(float x) {
    return (unsigned int)__ocml_rint_f32(x);
}
__device__ static inline unsigned int __float2uint_ru(float x) {
    return (unsigned int)__ocml_ceil_f32(x);
}
__device__ static inline unsigned int __float2uint_rz(float x) { return (unsigned int)x; }

__device__ static inline unsigned long long int __float2ull_rd(float x) {
    return (unsigned long long int)__ocml_floor_f32(x);
}
__device__ static inline unsigned long long int __float2ull_rn(float x) {
    return (unsigned long long int)__ocml_rint_f32(x);
}
__device__ static inline unsigned long long int __float2ull_ru(float x) {
    return (unsigned long long int)__ocml_ceil_f32(x);
}
__device__ static inline unsigned long long int __float2ull_rz(float x) {
    return (unsigned long long int)x;
}

__device__ static inline int __float_as_int(float x) {
    static_assert(sizeof(int) == sizeof(float), "");

    int tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}

__device__ static inline unsigned int __float_as_uint(float x) {
    static_assert(sizeof(unsigned int) == sizeof(float), "");

    unsigned int tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}

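// Usage sketch (illustrative; the helper name is hypothetical): the bit-cast
// helpers enable bit-level float tricks, such as reading the sign bit without
// a floating-point compare (which would be unordered for NaN).
//
//   __device__ bool sign_bit_set(float x) {
//     return (__float_as_uint(x) & 0x80000000u) != 0;
//   }
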
__device__ static inline double __hiloint2double(int hi, int lo) {
    static_assert(sizeof(double) == sizeof(uint64_t), "");

    uint64_t tmp0 = (static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
    double tmp1;
    __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));

    return tmp1;
}

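// Usage sketch (illustrative; the helper name is hypothetical):
// __double2hiint/__double2loint and __hiloint2double round-trip a double
// through two 32-bit halves, e.g. for storing doubles in 32-bit slots.
//
//   __device__ double roundtrip(double v) {
//     int hi = __double2hiint(v);
//     int lo = __double2loint(v);
//     return __hiloint2double(hi, lo);   // bit-exact reconstruction of v
//   }
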
__device__ static inline double __int2double_rn(int x) { return (double)x; }

__device__ static inline float __int2float_rd(int x) {
    return __ocml_cvtrtn_f32_s32(x);
}
__device__ static inline float __int2float_rn(int x) { return (float)x; }
__device__ static inline float __int2float_ru(int x) {
    return __ocml_cvtrtp_f32_s32(x);
}
__device__ static inline float __int2float_rz(int x) {
    return __ocml_cvtrtz_f32_s32(x);
}

__device__ static inline float __int_as_float(int x) {
    static_assert(sizeof(float) == sizeof(int), "");

    float tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}

__device__ static inline double __ll2double_rd(long long int x) {
    return __ocml_cvtrtn_f64_s64(x);
}
__device__ static inline double __ll2double_rn(long long int x) { return (double)x; }
__device__ static inline double __ll2double_ru(long long int x) {
    return __ocml_cvtrtp_f64_s64(x);
}
__device__ static inline double __ll2double_rz(long long int x) {
    return __ocml_cvtrtz_f64_s64(x);
}

__device__ static inline float __ll2float_rd(long long int x) {
    return __ocml_cvtrtn_f32_s64(x);
}
__device__ static inline float __ll2float_rn(long long int x) { return (float)x; }
__device__ static inline float __ll2float_ru(long long int x) {
    return __ocml_cvtrtp_f32_s64(x);
}
__device__ static inline float __ll2float_rz(long long int x) {
    return __ocml_cvtrtz_f32_s64(x);
}

__device__ static inline double __longlong_as_double(long long int x) {
    static_assert(sizeof(double) == sizeof(long long), "");

    double tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}

__device__ static inline double __uint2double_rn(unsigned int x) { return (double)x; }

__device__ static inline float __uint2float_rd(unsigned int x) {
    return __ocml_cvtrtn_f32_u32(x);
}
__device__ static inline float __uint2float_rn(unsigned int x) { return (float)x; }
__device__ static inline float __uint2float_ru(unsigned int x) {
    return __ocml_cvtrtp_f32_u32(x);
}
__device__ static inline float __uint2float_rz(unsigned int x) {
    return __ocml_cvtrtz_f32_u32(x);
}

__device__ static inline float __uint_as_float(unsigned int x) {
    static_assert(sizeof(float) == sizeof(unsigned int), "");

    float tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}

__device__ static inline double __ull2double_rd(unsigned long long int x) {
    return __ocml_cvtrtn_f64_u64(x);
}
__device__ static inline double __ull2double_rn(unsigned long long int x) { return (double)x; }
__device__ static inline double __ull2double_ru(unsigned long long int x) {
    return __ocml_cvtrtp_f64_u64(x);
}
__device__ static inline double __ull2double_rz(unsigned long long int x) {
    return __ocml_cvtrtz_f64_u64(x);
}

__device__ static inline float __ull2float_rd(unsigned long long int x) {
    return __ocml_cvtrtn_f32_u64(x);
}
__device__ static inline float __ull2float_rn(unsigned long long int x) { return (float)x; }
__device__ static inline float __ull2float_ru(unsigned long long int x) {
    return __ocml_cvtrtp_f32_u64(x);
}
__device__ static inline float __ull2float_rz(unsigned long long int x) {
    return __ocml_cvtrtz_f32_u64(x);
}

#if defined(__clang__) && defined(__HIP__)

// Clock functions
__device__ long long int __clock64();
__device__ long long int __clock();
__device__ long long int clock64();
__device__ long long int clock();
__device__ long long int wall_clock64();
// hip.amdgcn.bc - named sync
__device__ void __named_sync();

#ifdef __HIP_DEVICE_COMPILE__

// Clock function to return the GPU core cycle count.
// The GPU can change its core clock frequency at runtime. The maximum frequency
// can be queried through the hipDeviceAttributeClockRate attribute.
__device__
inline __attribute__((always_inline))
long long int __clock64() {
#if __has_builtin(__builtin_amdgcn_s_memtime)
    // Exists on gfx8, gfx9, gfx10.1, gfx10.2, gfx10.3
    return (long long int)__builtin_amdgcn_s_memtime();
#else
    // Subject to change when a better solution is available
    return (long long int)__builtin_readcyclecounter();
#endif
}

__device__
inline __attribute__((always_inline))
long long int __clock() { return __clock64(); }

// Clock function to return a wall clock count at a constant frequency, which can
// be queried through the hipDeviceAttributeWallClockRate attribute.
__device__
inline __attribute__((always_inline))
long long int wall_clock64() {
    return (long long int)__ockl_steadyctr_u64();
}

__device__
inline __attribute__((always_inline))
long long int clock64() { return __clock64(); }

__device__
inline __attribute__((always_inline))
long long int clock() { return __clock(); }

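// Usage sketch (illustrative; the helper name is hypothetical): timing a
// device-side region. clock64() counts core cycles at a variable frequency;
// wall_clock64() ticks at the constant hipDeviceAttributeWallClockRate rate.
//
//   __device__ long long time_region() {
//     long long t0 = wall_clock64();
//     // ... work to be measured ...
//     return wall_clock64() - t0;     // elapsed wall-clock ticks
//   }
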
// hip.amdgcn.bc - named sync
__device__
inline
void __named_sync() { __builtin_amdgcn_s_barrier(); }

#endif // __HIP_DEVICE_COMPILE__

// hip.amdgcn.bc - lanemask
__device__
inline
uint64_t __lanemask_gt()
{
    uint32_t lane = __ockl_lane_u32();
    if (lane == 63)
        return 0;
    uint64_t ballot = __ballot64(1);
    uint64_t mask = (~((uint64_t)0)) << (lane + 1);
    return mask & ballot;
}

__device__
inline
uint64_t __lanemask_lt()
{
    uint32_t lane = __ockl_lane_u32();
    uint64_t ballot = __ballot64(1);
    uint64_t mask = ((uint64_t)1 << lane) - (uint64_t)1;
    return mask & ballot;
}

__device__
inline
uint64_t __lanemask_eq()
{
    uint32_t lane = __ockl_lane_u32();
    uint64_t mask = ((uint64_t)1 << lane);
    return mask;
}

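// Usage sketch (illustrative; the helper name is hypothetical): an exclusive
// prefix count of a predicate across the wavefront (wave64 assumed), a common
// building block for stream compaction.
//
//   __device__ unsigned int excl_pred_count(int pred) {
//     uint64_t votes = __ballot64(!!pred);
//     return __popcll(votes & __lanemask_lt());  // set lanes below this one
//   }
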
__device__ inline void* __local_to_generic(void* p) { return p; }

#ifdef __HIP_DEVICE_COMPILE__
__device__
inline
void* __get_dynamicgroupbaseptr()
{
    // Get group segment base pointer.
    return (char*)__local_to_generic((void*)__to_local(__builtin_amdgcn_groupstaticsize()));
}
#else
__device__
void* __get_dynamicgroupbaseptr();
#endif // __HIP_DEVICE_COMPILE__

__device__
inline
void* __amdgcn_get_dynamicgroupbaseptr() {
    return __get_dynamicgroupbaseptr();
}

// Memory Fence Functions
__device__
inline
static void __threadfence()
{
    __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "agent");
}

__device__
inline
static void __threadfence_block()
{
    __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup");
}

__device__
inline
static void __threadfence_system()
{
    __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "");
}

__device__ inline static void __work_group_barrier(__cl_mem_fence_flags flags) {
    if (flags) {
        __builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup");
        __builtin_amdgcn_s_barrier();
        __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
    } else {
        __builtin_amdgcn_s_barrier();
    }
}

__device__
inline
static void __barrier(int n)
{
    __work_group_barrier((__cl_mem_fence_flags)n);
}

__device__
inline
__attribute__((convergent))
void __syncthreads()
{
    __barrier(__CLK_LOCAL_MEM_FENCE);
}

__device__
inline
__attribute__((convergent))
int __syncthreads_count(int predicate)
{
    return __ockl_wgred_add_i32(!!predicate);
}

__device__
inline
__attribute__((convergent))
int __syncthreads_and(int predicate)
{
    return __ockl_wgred_and_i32(!!predicate);
}

__device__
inline
__attribute__((convergent))
int __syncthreads_or(int predicate)
{
    return __ockl_wgred_or_i32(!!predicate);
}

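// Usage sketch (illustrative; kernel and parameter names are hypothetical):
// a block-wide vote with __syncthreads_count, assuming the launch uses full
// blocks so every thread participates.
//
//   __global__ void all_below(const float* in, int* block_ok, float limit) {
//     int idx = blockIdx.x * blockDim.x + threadIdx.x;
//     int hits = __syncthreads_count(in[idx] < limit);
//     if (threadIdx.x == 0)
//       block_ok[blockIdx.x] = (hits == (int)blockDim.x);
//   }
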
// hip.amdgcn.bc - device routine
/*
   HW_ID Register bit structure for RDNA2 & RDNA3
   WAVE_ID     4:0     Wave id within the SIMD.
   SIMD_ID     9:8     SIMD_ID within the WGP: [0] = row, [1] = column.
   WGP_ID      13:10   Physical WGP ID.
   SA_ID       16      Shader Array ID
   SE_ID       20:18   Shader Engine the wave is assigned to for gfx11
   SE_ID       19:18   Shader Engine the wave is assigned to for gfx10
   DP_RATE     31:29   Number of double-precision float units per SIMD

   HW_ID Register bit structure for GCN and CDNA
   WAVE_ID     3:0     Wave buffer slot number. 0-9.
   SIMD_ID     5:4     SIMD which the wave is assigned to within the CU.
   PIPE_ID     7:6     Pipeline from which the wave was dispatched.
   CU_ID       11:8    Compute Unit the wave is assigned to.
   SH_ID       12      Shader Array (within an SE) the wave is assigned to.
   SE_ID       15:13   Shader Engine the wave is assigned to for gfx908, gfx90a
               14:13   Shader Engine the wave is assigned to for gfx940-942
   TG_ID       19:16   Thread-group ID
   VM_ID       23:20   Virtual Memory ID
   QUEUE_ID    26:24   Queue from which this wave was dispatched.
   STATE_ID    29:27   State ID (graphics only, not compute).
   ME_ID       31:30   Micro-engine ID.

   XCC_ID Register bit structure for gfx940
   XCC_ID      3:0     XCC the wave is assigned to.
 */

#if (defined(__GFX10__) || defined(__GFX11__))
  #define HW_ID 23
#else
  #define HW_ID 4
#endif

#if (defined(__GFX10__) || defined(__GFX11__))
  #define HW_ID_WGP_ID_SIZE 4
  #define HW_ID_WGP_ID_OFFSET 10
  #if (defined(__AMDGCN_CUMODE__))
    #define HW_ID_CU_ID_SIZE 1
    #define HW_ID_CU_ID_OFFSET 8
  #endif
#else
  #define HW_ID_CU_ID_SIZE 4
  #define HW_ID_CU_ID_OFFSET 8
#endif

#if (defined(__gfx908__) || defined(__gfx90a__) || \
     defined(__GFX11__))
  #define HW_ID_SE_ID_SIZE 3
#else // 4 SEs/XCC for gfx940-942
  #define HW_ID_SE_ID_SIZE 2
#endif
#if (defined(__GFX10__) || defined(__GFX11__))
  #define HW_ID_SE_ID_OFFSET 18
  #define HW_ID_SA_ID_OFFSET 16
  #define HW_ID_SA_ID_SIZE 1
#else
  #define HW_ID_SE_ID_OFFSET 13
#endif

#if (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
  #define XCC_ID 20
  #define XCC_ID_XCC_ID_SIZE 4
  #define XCC_ID_XCC_ID_OFFSET 0
#endif

#if (!defined(__HIP_NO_IMAGE_SUPPORT) && \
     (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)))
  #define __HIP_NO_IMAGE_SUPPORT 1
#endif

/*
   Encoding of parameter bitmask
   HW_ID       5:0     HW_ID
   OFFSET      10:6    Range: 0..31
   SIZE        15:11   Range: 1..32
 */

#define GETREG_IMMED(SZ, OFF, REG) (((SZ) << 11) | ((OFF) << 6) | (REG))

/*
  __smid returns the wave's assigned Compute Unit and Shader Engine.
  The Compute Unit, CU_ID, is returned in bits 3:0, and the Shader Engine,
  SE_ID, in bits 5:4. Note: the results may vary over time.
  SIZE is 1-based, hence the SZ - 1 in each GETREG_IMMED below.
*/
__device__
inline
unsigned __smid(void)
{
    unsigned se_id = __builtin_amdgcn_s_getreg(
        GETREG_IMMED(HW_ID_SE_ID_SIZE - 1, HW_ID_SE_ID_OFFSET, HW_ID));
#if (defined(__GFX10__) || defined(__GFX11__))
    unsigned wgp_id = __builtin_amdgcn_s_getreg(
        GETREG_IMMED(HW_ID_WGP_ID_SIZE - 1, HW_ID_WGP_ID_OFFSET, HW_ID));
    unsigned sa_id = __builtin_amdgcn_s_getreg(
        GETREG_IMMED(HW_ID_SA_ID_SIZE - 1, HW_ID_SA_ID_OFFSET, HW_ID));
  #if (defined(__AMDGCN_CUMODE__))
    unsigned cu_id = __builtin_amdgcn_s_getreg(
        GETREG_IMMED(HW_ID_CU_ID_SIZE - 1, HW_ID_CU_ID_OFFSET, HW_ID));
  #endif
#else
  #if (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
    unsigned xcc_id = __builtin_amdgcn_s_getreg(
        GETREG_IMMED(XCC_ID_XCC_ID_SIZE - 1, XCC_ID_XCC_ID_OFFSET, XCC_ID));
  #endif
    unsigned cu_id = __builtin_amdgcn_s_getreg(
        GETREG_IMMED(HW_ID_CU_ID_SIZE - 1, HW_ID_CU_ID_OFFSET, HW_ID));
#endif
#if (defined(__GFX10__) || defined(__GFX11__))
    unsigned temp = se_id;
    temp = (temp << HW_ID_SA_ID_SIZE) | sa_id;
    temp = (temp << HW_ID_WGP_ID_SIZE) | wgp_id;
  #if (defined(__AMDGCN_CUMODE__))
    temp = (temp << HW_ID_CU_ID_SIZE) | cu_id;
  #endif
    return temp;
    //TODO : CU Mode impl
#elif (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
    unsigned temp = xcc_id;
    temp = (temp << HW_ID_SE_ID_SIZE) | se_id;
    temp = (temp << HW_ID_CU_ID_SIZE) | cu_id;
    return temp;
#else
    return (se_id << HW_ID_CU_ID_SIZE) + cu_id;
#endif
}

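// Usage sketch (illustrative; kernel and parameter names are hypothetical):
// tagging per-block trace records with the hardware unit that ran them.
// The value is hardware-assigned and may differ from launch to launch.
//
//   __global__ void where_am_i(unsigned int* unit) {
//     if (threadIdx.x == 0)
//       unit[blockIdx.x] = __smid();
//   }
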
#define HIP_DYNAMIC_SHARED(type, var) extern __shared__ type var[];
#define HIP_DYNAMIC_SHARED_ATTRIBUTE

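// Usage sketch (illustrative; kernel and parameter names are hypothetical):
// HIP_DYNAMIC_SHARED declares dynamic shared memory; its byte size comes from
// the third launch parameter in hipLaunchKernelGGL / <<<...>>>.
//
//   __global__ void scale(float* data, float f) {
//     HIP_DYNAMIC_SHARED(float, smem)  // expands to: extern __shared__ float smem[];
//     smem[threadIdx.x] = data[threadIdx.x] * f;
//     __syncthreads();
//     data[threadIdx.x] = smem[threadIdx.x];
//   }
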
#endif // defined(__clang__) && defined(__HIP__)


// loop unrolling
static inline __device__ void* __hip_hc_memcpy(void* dst, const void* src, size_t size) {
    auto dstPtr = static_cast<unsigned char*>(dst);
    auto srcPtr = static_cast<const unsigned char*>(src);

    while (size >= 4u) {
        dstPtr[0] = srcPtr[0];
        dstPtr[1] = srcPtr[1];
        dstPtr[2] = srcPtr[2];
        dstPtr[3] = srcPtr[3];

        size -= 4u;
        srcPtr += 4u;
        dstPtr += 4u;
    }
    // Copy the remaining 0-3 bytes; each case intentionally falls through.
    switch (size) {
        case 3:
            dstPtr[2] = srcPtr[2];
            // fall through
        case 2:
            dstPtr[1] = srcPtr[1];
            // fall through
        case 1:
            dstPtr[0] = srcPtr[0];
    }

    return dst;
}

static inline __device__ void* __hip_hc_memset(void* dst, unsigned char val, size_t size) {
    auto dstPtr = static_cast<unsigned char*>(dst);

    while (size >= 4u) {
        dstPtr[0] = val;
        dstPtr[1] = val;
        dstPtr[2] = val;
        dstPtr[3] = val;

        size -= 4u;
        dstPtr += 4u;
    }
    // Set the remaining 0-3 bytes; each case intentionally falls through.
    switch (size) {
        case 3:
            dstPtr[2] = val;
            // fall through
        case 2:
            dstPtr[1] = val;
            // fall through
        case 1:
            dstPtr[0] = val;
    }

    return dst;
}
#ifndef __OPENMP_AMDGCN__
static inline __device__ void* memcpy(void* dst, const void* src, size_t size) {
    return __hip_hc_memcpy(dst, src, size);
}

static inline __device__ void* memset(void* ptr, int val, size_t size) {
    unsigned char val8 = static_cast<unsigned char>(val);
    return __hip_hc_memset(ptr, val8, size);
}
#endif // !__OPENMP_AMDGCN__

#endif