HIP: Heterogeneous-computing Interface for Portability
amd_device_functions.h
/*
Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_FUNCTIONS_H
#define HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_FUNCTIONS_H

#if !defined(__HIPCC_RTC__)
#include <hip/amd_detail/amd_hip_common.h>
#include <hip/amd_detail/hip_assert.h>
#include "host_defines.h"
#include "math_fwd.h"
#include <hip/hip_runtime_api.h>
#include <stddef.h>
#include <hip/hip_vector_types.h>
#endif // !defined(__HIPCC_RTC__)

#if defined(__clang__) && defined(__HIP__)
extern "C" __device__ int printf(const char *fmt, ...);
#else
template <typename... All>
static inline __device__ void printf(const char* format, All... all) {}
#endif

extern "C" __device__ unsigned long long __ockl_steadyctr_u64();

/*
Integer Intrinsics
*/

// Integer intrinsic functions: __popc, __clz, __ffs, __brev
__device__ static inline unsigned int __popc(unsigned int input) {
    return __builtin_popcount(input);
}
__device__ static inline unsigned int __popcll(unsigned long long int input) {
    return __builtin_popcountll(input);
}

__device__ static inline int __clz(int input) {
    return __ockl_clz_u32((uint)input);
}

__device__ static inline int __clzll(long long int input) {
    return __ockl_clz_u64((uint64_t)input);
}

__device__ static inline unsigned int __ffs(unsigned int input) {
    return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
}

__device__ static inline unsigned int __ffsll(unsigned long long int input) {
    return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
}

__device__ static inline unsigned int __ffsll(unsigned long int input) {
    return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
}

__device__ static inline unsigned int __ffs(int input) {
    return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
}

__device__ static inline unsigned int __ffsll(long long int input) {
    return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
}

__device__ static inline unsigned int __ffsll(long int input) {
    return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
}
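
// Usage sketch (illustrative): __ffs returns the 1-based position of the
// least-significant set bit, or 0 when no bit is set.
//   __ffs(0u)           == 0
//   __ffs(0x8u)         == 4   // bit 3 is set; 3 + 1
//   __ffsll(1ull << 40) == 41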

// Given a 32/64-bit exec mask and an integer base bit position (between 0 and
// WAVEFRONT_SIZE), find the n-th (given by offset) set bit in the exec mask
// starting from the base bit, and return its bit position.
// If no such bit is found, return -1.
__device__ static int32_t __fns64(uint64_t mask, uint32_t base, int32_t offset) {
    uint64_t temp_mask = mask;
    int32_t temp_offset = offset;

    if (offset == 0) {
        temp_mask &= (1ULL << base);
        temp_offset = 1;
    }
    else if (offset < 0) {
        temp_mask = __builtin_bitreverse64(mask);
        base = 63 - base;
        temp_offset = -offset;
    }

    temp_mask = temp_mask & ((~0ULL) << base);
    if (__builtin_popcountll(temp_mask) < temp_offset)
        return -1;
    int32_t total = 0;
    for (int i = 0x20; i > 0; i >>= 1) {
        uint64_t temp_mask_lo = temp_mask & ((1ULL << i) - 1);
        int32_t pcnt = __builtin_popcountll(temp_mask_lo);
        if (pcnt < temp_offset) {
            temp_mask = temp_mask >> i;
            temp_offset -= pcnt;
            total += i;
        }
        else {
            temp_mask = temp_mask_lo;
        }
    }
    if (offset < 0)
        return 63 - total;
    else
        return total;
}
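
// Usage sketch (illustrative): find the next active lane at or after this
// one; __ballot64 and __lane_id are assumed to come from the warp headers
// included further below.
//   uint64_t active = __ballot64(1);
//   int32_t next = __fns64(active, __lane_id(), 1);  // -1 if none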

__device__ static int32_t __fns32(uint64_t mask, uint32_t base, int32_t offset) {
    uint64_t temp_mask = mask;
    int32_t temp_offset = offset;
    if (offset == 0) {
        temp_mask &= (1ULL << base);
        temp_offset = 1;
    }
    else if (offset < 0) {
        temp_mask = __builtin_bitreverse64(mask);
        base = 63 - base;
        temp_offset = -offset;
    }
    temp_mask = temp_mask & ((~0ULL) << base);
    if (__builtin_popcountll(temp_mask) < temp_offset)
        return -1;
    int32_t total = 0;
    for (int i = 0x20; i > 0; i >>= 1) {
        uint64_t temp_mask_lo = temp_mask & ((1ULL << i) - 1);
        int32_t pcnt = __builtin_popcountll(temp_mask_lo);
        if (pcnt < temp_offset) {
            temp_mask = temp_mask >> i;
            temp_offset -= pcnt;
            total += i;
        }
        else {
            temp_mask = temp_mask_lo;
        }
    }
    if (offset < 0)
        return 63 - total;
    else
        return total;
}
__device__ static inline unsigned int __brev(unsigned int input) {
    return __builtin_bitreverse32(input);
}

__device__ static inline unsigned long long int __brevll(unsigned long long int input) {
    return __builtin_bitreverse64(input);
}

__device__ static inline unsigned int __lastbit_u32_u64(uint64_t input) {
    return input == 0 ? -1 : __builtin_ctzl(input);
}

__device__ static inline unsigned int __bitextract_u32(unsigned int src0, unsigned int src1, unsigned int src2) {
    uint32_t offset = src1 & 31;
    uint32_t width = src2 & 31;
    return width == 0 ? 0 : (src0 << (32 - offset - width)) >> (32 - width);
}

__device__ static inline uint64_t __bitextract_u64(uint64_t src0, unsigned int src1, unsigned int src2) {
    uint64_t offset = src1 & 63;
    uint64_t width = src2 & 63;
    return width == 0 ? 0 : (src0 << (64 - offset - width)) >> (64 - width);
}

__device__ static inline unsigned int __bitinsert_u32(unsigned int src0, unsigned int src1, unsigned int src2, unsigned int src3) {
    uint32_t offset = src2 & 31;
    uint32_t width = src3 & 31;
    uint32_t mask = (1u << width) - 1;
    return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
}

__device__ static inline uint64_t __bitinsert_u64(uint64_t src0, uint64_t src1, unsigned int src2, unsigned int src3) {
    uint64_t offset = src2 & 63;
    uint64_t width = src3 & 63;
    uint64_t mask = (1ULL << width) - 1;
    return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
}

__device__ inline unsigned int __funnelshift_l(unsigned int lo, unsigned int hi, unsigned int shift)
{
    uint32_t mask_shift = shift & 31;
    return mask_shift == 0 ? hi : __builtin_amdgcn_alignbit(hi, lo, 32 - mask_shift);
}

__device__ inline unsigned int __funnelshift_lc(unsigned int lo, unsigned int hi, unsigned int shift)
{
    uint32_t min_shift = shift >= 32 ? 32 : shift;
    return min_shift == 0 ? hi : __builtin_amdgcn_alignbit(hi, lo, 32 - min_shift);
}

__device__ inline unsigned int __funnelshift_r(unsigned int lo, unsigned int hi, unsigned int shift)
{
    return __builtin_amdgcn_alignbit(hi, lo, shift);
}

__device__ inline unsigned int __funnelshift_rc(unsigned int lo, unsigned int hi, unsigned int shift)
{
    return shift >= 32 ? hi : __builtin_amdgcn_alignbit(hi, lo, shift);
}
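
// Usage sketch (illustrative): a funnel shift treats {hi:lo} as one 64-bit
// value; __funnelshift_l shifts it left and keeps the upper 32 bits, so the
// top bits of lo shift into the result.
//   __funnelshift_l(0x80000000u, 0x00000001u, 4) == 0x18u
//   // {hi:lo} = 0x00000001'80000000 << 4 -> upper word is 0x00000018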

__device__ static unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s);
__device__ static unsigned int __hadd(int x, int y);
__device__ static int __mul24(int x, int y);
__device__ static long long int __mul64hi(long long int x, long long int y);
__device__ static int __mulhi(int x, int y);
__device__ static int __rhadd(int x, int y);
__device__ static unsigned int __sad(int x, int y, unsigned int z);
__device__ static unsigned int __uhadd(unsigned int x, unsigned int y);
__device__ static int __umul24(unsigned int x, unsigned int y);
__device__ static unsigned long long int __umul64hi(unsigned long long int x, unsigned long long int y);
__device__ static unsigned int __umulhi(unsigned int x, unsigned int y);
__device__ static unsigned int __urhadd(unsigned int x, unsigned int y);
__device__ static unsigned int __usad(unsigned int x, unsigned int y, unsigned int z);

struct ucharHolder {
    union {
        unsigned char c[4];
        unsigned int ui;
    };
} __attribute__((aligned(4)));

struct uchar2Holder {
    union {
        unsigned int ui[2];
        unsigned char c[8];
    };
} __attribute__((aligned(8)));

__device__
static inline unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s) {
    struct uchar2Holder cHoldVal;
    struct ucharHolder cHoldKey;
    cHoldKey.ui = s;
    cHoldVal.ui[0] = x;
    cHoldVal.ui[1] = y;
    unsigned int result;
    result = cHoldVal.c[cHoldKey.c[0] & 0x07];
    result += (cHoldVal.c[(cHoldKey.c[0] & 0x70) >> 4] << 8);
    result += (cHoldVal.c[cHoldKey.c[1] & 0x07] << 16);
    result += (cHoldVal.c[(cHoldKey.c[1] & 0x70) >> 4] << 24);
    return result;
}
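
// Usage sketch (illustrative): each selector nibble of s picks one byte from
// the eight-byte pool {x, y} (0-3 index into x, 4-7 into y), low nibble first.
// Selector 0x0123 therefore reverses the bytes of x:
//   __byte_perm(0x11223344u, 0u, 0x0123u) == 0x44332211u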

__device__ static inline unsigned int __hadd(int x, int y) {
    // Signed halving add, rounding toward negative infinity; computed
    // per-operand so the intermediate sum cannot overflow.
    return (x >> 1) + (y >> 1) + (x & y & 1);
}

__device__ static inline int __mul24(int x, int y) {
    return __ockl_mul24_i32(x, y);
}

__device__ static inline long long __mul64hi(long long int x, long long int y) {
    unsigned long long x0 = (unsigned long long)x & 0xffffffffUL;
    long long x1 = x >> 32;
    unsigned long long y0 = (unsigned long long)y & 0xffffffffUL;
    long long y1 = y >> 32;
    unsigned long long z0 = x0*y0;
    long long t = x1*y0 + (z0 >> 32);
    long long z1 = t & 0xffffffffL;
    long long z2 = t >> 32;
    z1 = x0*y1 + z1;
    return x1*y1 + z2 + (z1 >> 32);
}

__device__ static inline int __mulhi(int x, int y) {
    return __ockl_mul_hi_i32(x, y);
}

__device__ static inline int __rhadd(int x, int y) {
    // Signed halving add, rounding toward positive infinity; computed
    // per-operand so the intermediate sum cannot overflow.
    return (x >> 1) + (y >> 1) + ((x | y) & 1);
}
__device__ static inline unsigned int __sad(int x, int y, unsigned int z) {
    return x > y ? x - y + z : y - x + z;
}
__device__ static inline unsigned int __uhadd(unsigned int x, unsigned int y) {
    // Unsigned halving add; computed per-operand so the intermediate sum
    // cannot wrap.
    return (x >> 1) + (y >> 1) + (x & y & 1);
}
__device__ static inline int __umul24(unsigned int x, unsigned int y) {
    return __ockl_mul24_u32(x, y);
}

__device__
static inline unsigned long long __umul64hi(unsigned long long int x, unsigned long long int y) {
    unsigned long long x0 = x & 0xffffffffUL;
    unsigned long long x1 = x >> 32;
    unsigned long long y0 = y & 0xffffffffUL;
    unsigned long long y1 = y >> 32;
    unsigned long long z0 = x0*y0;
    unsigned long long t = x1*y0 + (z0 >> 32);
    unsigned long long z1 = t & 0xffffffffUL;
    unsigned long long z2 = t >> 32;
    z1 = x0*y1 + z1;
    return x1*y1 + z2 + (z1 >> 32);
}
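
// Sanity sketch (illustrative): with 128-bit integer support, the result is
// the upper half of the full product.
//   unsigned long long hi = __umul64hi(a, b);
//   // hi == (unsigned long long)(((__uint128_t)a * b) >> 64)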

__device__ static inline unsigned int __umulhi(unsigned int x, unsigned int y) {
    return __ockl_mul_hi_u32(x, y);
}
__device__ static inline unsigned int __urhadd(unsigned int x, unsigned int y) {
    // Unsigned halving add, rounding up; computed per-operand so the
    // intermediate sum cannot wrap.
    return (x >> 1) + (y >> 1) + ((x | y) & 1);
}
__device__ static inline unsigned int __usad(unsigned int x, unsigned int y, unsigned int z) {
    return __ockl_sadd_u32(x, y, z);
}

__device__
static inline unsigned int __mbcnt_lo(unsigned int x, unsigned int y) { return __builtin_amdgcn_mbcnt_lo(x, y); }

__device__
static inline unsigned int __mbcnt_hi(unsigned int x, unsigned int y) { return __builtin_amdgcn_mbcnt_hi(x, y); }
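
// Usage sketch (illustrative): chaining the two mbcnt halves over a full mask
// counts the lanes below the current one, i.e. the lane index within a
// 64-wide wavefront.
//   unsigned lane = __mbcnt_hi(~0u, __mbcnt_lo(~0u, 0u));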

/*
HIP specific device functions
*/

#if !defined(__HIPCC_RTC__)
#include "amd_warp_functions.h"
#include "amd_warp_sync_functions.h"
#endif

#define MASK1 0x00ff00ff
#define MASK2 0xff00ff00

__device__ static inline char4 __hip_hc_add8pk(char4 in1, char4 in2) {
    char4 out;
    unsigned a, b, r;
    __builtin_memcpy(&a, &in1, sizeof(a));
    __builtin_memcpy(&b, &in2, sizeof(b));
    // Add the even and odd byte lanes separately: the zeroed byte between two
    // masked bytes absorbs any carry, and the final mask discards it.
    r = ((a & MASK1) + (b & MASK1)) & MASK1;
    r |= ((a & MASK2) + (b & MASK2)) & MASK2;
    __builtin_memcpy(&out, &r, sizeof(r));
    return out;
}

__device__ static inline char4 __hip_hc_sub8pk(char4 in1, char4 in2) {
    char4 out;
    unsigned a, b, r;
    __builtin_memcpy(&a, &in1, sizeof(a));
    __builtin_memcpy(&b, &in2, sizeof(b));
    // Subtract the even and odd byte lanes separately; borrows propagate only
    // into the zeroed byte above each lane and are masked off.
    r = ((a & MASK1) - (b & MASK1)) & MASK1;
    r |= ((a & MASK2) - (b & MASK2)) & MASK2;
    __builtin_memcpy(&out, &r, sizeof(r));
    return out;
}

__device__ static inline char4 __hip_hc_mul8pk(char4 in1, char4 in2) {
    // Byte products need the full 16-bit partial results, so multiply each
    // component directly rather than through the even/odd masks.
    char4 out;
    out.x = in1.x * in2.x;
    out.y = in1.y * in2.y;
    out.z = in1.z * in2.z;
    out.w = in1.w * in2.w;
    return out;
}

__device__ static inline float __double2float_rd(double x) {
    return __ocml_cvtrtn_f32_f64(x);
}
__device__ static inline float __double2float_rn(double x) { return (float)x; }
__device__ static inline float __double2float_ru(double x) {
    return __ocml_cvtrtp_f32_f64(x);
}
__device__ static inline float __double2float_rz(double x) {
    return __ocml_cvtrtz_f32_f64(x);
}

__device__ static inline int __double2hiint(double x) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");

    int tmp[2];
    __builtin_memcpy(tmp, &x, sizeof(tmp));

    return tmp[1];
}
__device__ static inline int __double2loint(double x) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");

    int tmp[2];
    __builtin_memcpy(tmp, &x, sizeof(tmp));

    return tmp[0];
}

__device__ static inline int __double2int_rd(double x) { return (int)__ocml_floor_f64(x); }
__device__ static inline int __double2int_rn(double x) { return (int)__ocml_rint_f64(x); }
__device__ static inline int __double2int_ru(double x) { return (int)__ocml_ceil_f64(x); }
__device__ static inline int __double2int_rz(double x) { return (int)x; }

__device__ static inline long long int __double2ll_rd(double x) {
    return (long long)__ocml_floor_f64(x);
}
__device__ static inline long long int __double2ll_rn(double x) {
    return (long long)__ocml_rint_f64(x);
}
__device__ static inline long long int __double2ll_ru(double x) {
    return (long long)__ocml_ceil_f64(x);
}
__device__ static inline long long int __double2ll_rz(double x) { return (long long)x; }

__device__ static inline unsigned int __double2uint_rd(double x) {
    return (unsigned int)__ocml_floor_f64(x);
}
__device__ static inline unsigned int __double2uint_rn(double x) {
    return (unsigned int)__ocml_rint_f64(x);
}
__device__ static inline unsigned int __double2uint_ru(double x) {
    return (unsigned int)__ocml_ceil_f64(x);
}
__device__ static inline unsigned int __double2uint_rz(double x) { return (unsigned int)x; }

__device__ static inline unsigned long long int __double2ull_rd(double x) {
    return (unsigned long long int)__ocml_floor_f64(x);
}
__device__ static inline unsigned long long int __double2ull_rn(double x) {
    return (unsigned long long int)__ocml_rint_f64(x);
}
__device__ static inline unsigned long long int __double2ull_ru(double x) {
    return (unsigned long long int)__ocml_ceil_f64(x);
}
__device__ static inline unsigned long long int __double2ull_rz(double x) {
    return (unsigned long long int)x;
}
__device__ static inline long long int __double_as_longlong(double x) {
    static_assert(sizeof(long long) == sizeof(double), "");

    long long tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}

/*
__device__ unsigned short __float2half_rn(float x);
__device__ float __half2float(unsigned short);

The above device functions are not valid here.
Use
__device__ __half __float2half_rn(float x);
__device__ float __half2float(__half);
from hip_fp16.h instead.

CUDA implements half as unsigned short, whereas HIP does not.
*/
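
// For example (illustrative, with hip_fp16.h included):
//   __half h = __float2half_rn(3.14f);
//   float  f = __half2float(h);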

__device__ static inline int __float2int_rd(float x) { return (int)__ocml_floor_f32(x); }
__device__ static inline int __float2int_rn(float x) { return (int)__ocml_rint_f32(x); }
__device__ static inline int __float2int_ru(float x) { return (int)__ocml_ceil_f32(x); }
__device__ static inline int __float2int_rz(float x) { return (int)__ocml_trunc_f32(x); }

__device__ static inline long long int __float2ll_rd(float x) {
    return (long long int)__ocml_floor_f32(x);
}
__device__ static inline long long int __float2ll_rn(float x) {
    return (long long int)__ocml_rint_f32(x);
}
__device__ static inline long long int __float2ll_ru(float x) {
    return (long long int)__ocml_ceil_f32(x);
}
__device__ static inline long long int __float2ll_rz(float x) { return (long long int)x; }

__device__ static inline unsigned int __float2uint_rd(float x) {
    return (unsigned int)__ocml_floor_f32(x);
}
__device__ static inline unsigned int __float2uint_rn(float x) {
    return (unsigned int)__ocml_rint_f32(x);
}
__device__ static inline unsigned int __float2uint_ru(float x) {
    return (unsigned int)__ocml_ceil_f32(x);
}
__device__ static inline unsigned int __float2uint_rz(float x) { return (unsigned int)x; }

__device__ static inline unsigned long long int __float2ull_rd(float x) {
    return (unsigned long long int)__ocml_floor_f32(x);
}
__device__ static inline unsigned long long int __float2ull_rn(float x) {
    return (unsigned long long int)__ocml_rint_f32(x);
}
__device__ static inline unsigned long long int __float2ull_ru(float x) {
    return (unsigned long long int)__ocml_ceil_f32(x);
}
__device__ static inline unsigned long long int __float2ull_rz(float x) {
    return (unsigned long long int)x;
}

__device__ static inline int __float_as_int(float x) {
    static_assert(sizeof(int) == sizeof(float), "");

    int tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}

__device__ static inline unsigned int __float_as_uint(float x) {
    static_assert(sizeof(unsigned int) == sizeof(float), "");

    unsigned int tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}

__device__ static inline double __hiloint2double(int hi, int lo) {
    static_assert(sizeof(double) == sizeof(uint64_t), "");

    uint64_t tmp0 = (static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
    double tmp1;
    __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));

    return tmp1;
}
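
// Round-trip sketch (illustrative): the hi/lo pair recovers the original
// double exactly.
//   __hiloint2double(__double2hiint(x), __double2loint(x)) == x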

__device__ static inline double __int2double_rn(int x) { return (double)x; }

__device__ static inline float __int2float_rd(int x) {
    return __ocml_cvtrtn_f32_s32(x);
}
__device__ static inline float __int2float_rn(int x) { return (float)x; }
__device__ static inline float __int2float_ru(int x) {
    return __ocml_cvtrtp_f32_s32(x);
}
__device__ static inline float __int2float_rz(int x) {
    return __ocml_cvtrtz_f32_s32(x);
}

__device__ static inline float __int_as_float(int x) {
    static_assert(sizeof(float) == sizeof(int), "");

    float tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}

__device__ static inline double __ll2double_rd(long long int x) {
    return __ocml_cvtrtn_f64_s64(x);
}
__device__ static inline double __ll2double_rn(long long int x) { return (double)x; }
__device__ static inline double __ll2double_ru(long long int x) {
    return __ocml_cvtrtp_f64_s64(x);
}
__device__ static inline double __ll2double_rz(long long int x) {
    return __ocml_cvtrtz_f64_s64(x);
}

__device__ static inline float __ll2float_rd(long long int x) {
    return __ocml_cvtrtn_f32_s64(x);
}
__device__ static inline float __ll2float_rn(long long int x) { return (float)x; }
__device__ static inline float __ll2float_ru(long long int x) {
    return __ocml_cvtrtp_f32_s64(x);
}
__device__ static inline float __ll2float_rz(long long int x) {
    return __ocml_cvtrtz_f32_s64(x);
}

__device__ static inline double __longlong_as_double(long long int x) {
    static_assert(sizeof(double) == sizeof(long long), "");

    double tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}

__device__ static inline double __uint2double_rn(unsigned int x) { return (double)x; }

__device__ static inline float __uint2float_rd(unsigned int x) {
    return __ocml_cvtrtn_f32_u32(x);
}
__device__ static inline float __uint2float_rn(unsigned int x) { return (float)x; }
__device__ static inline float __uint2float_ru(unsigned int x) {
    return __ocml_cvtrtp_f32_u32(x);
}
__device__ static inline float __uint2float_rz(unsigned int x) {
    return __ocml_cvtrtz_f32_u32(x);
}

__device__ static inline float __uint_as_float(unsigned int x) {
    static_assert(sizeof(float) == sizeof(unsigned int), "");

    float tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}

__device__ static inline double __ull2double_rd(unsigned long long int x) {
    return __ocml_cvtrtn_f64_u64(x);
}
__device__ static inline double __ull2double_rn(unsigned long long int x) { return (double)x; }
__device__ static inline double __ull2double_ru(unsigned long long int x) {
    return __ocml_cvtrtp_f64_u64(x);
}
__device__ static inline double __ull2double_rz(unsigned long long int x) {
    return __ocml_cvtrtz_f64_u64(x);
}

__device__ static inline float __ull2float_rd(unsigned long long int x) {
    return __ocml_cvtrtn_f32_u64(x);
}
__device__ static inline float __ull2float_rn(unsigned long long int x) { return (float)x; }
__device__ static inline float __ull2float_ru(unsigned long long int x) {
    return __ocml_cvtrtp_f32_u64(x);
}
__device__ static inline float __ull2float_rz(unsigned long long int x) {
    return __ocml_cvtrtz_f32_u64(x);
}
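
// Rounding-mode suffixes (illustrative): _rd rounds toward negative infinity,
// _rn to nearest even, _ru toward positive infinity, and _rz toward zero.
//   __float2int_rd(-1.5f) == -2
//   __float2int_rn(-1.5f) == -2
//   __float2int_ru(-1.5f) == -1
//   __float2int_rz(-1.5f) == -1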

#if defined(__clang__) && defined(__HIP__)

// Clock functions
__device__ long long int __clock64();
__device__ long long int __clock();
__device__ long long int clock64();
__device__ long long int clock();
__device__ long long int wall_clock64();
// hip.amdgcn.bc - named sync
__device__ void __named_sync();

#ifdef __HIP_DEVICE_COMPILE__

// Clock function to return the GPU core cycle count. The GPU can change its
// core clock frequency at runtime; the maximum frequency can be queried
// through the hipDeviceAttributeClockRate attribute.
__device__
inline __attribute__((always_inline))
long long int __clock64() {
#if __has_builtin(__builtin_amdgcn_s_memtime)
    // Exists on gfx8, gfx9, gfx10.1, gfx10.2, gfx10.3
    return (long long int) __builtin_amdgcn_s_memtime();
#else
    // Subject to change when a better solution is available
    return (long long int) __builtin_readcyclecounter();
#endif
}

__device__
inline __attribute__((always_inline))
long long int __clock() { return __clock64(); }

// Clock function to return a wall clock count at a constant frequency, which
// can be queried through the hipDeviceAttributeWallClockRate attribute.
__device__
inline __attribute__((always_inline))
long long int wall_clock64() {
    return (long long int) __ockl_steadyctr_u64();
}

__device__
inline __attribute__((always_inline))
long long int clock64() { return __clock64(); }

__device__
inline __attribute__((always_inline))
long long int clock() { return __clock(); }
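
// Timing sketch (illustrative): wall_clock64() ticks at the fixed rate
// reported by hipDeviceAttributeWallClockRate (in kHz), so elapsed
// milliseconds can be derived as:
//   long long t0 = wall_clock64();
//   /* ... work ... */
//   long long t1 = wall_clock64();
//   double ms = (double)(t1 - t0) / wallClockRateKhz;  // rate queried on host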

// hip.amdgcn.bc - named sync
__device__
inline
void __named_sync() { __builtin_amdgcn_s_barrier(); }

#endif // __HIP_DEVICE_COMPILE__

// hip.amdgcn.bc - lanemask
__device__
inline
uint64_t __lanemask_gt()
{
    uint32_t lane = __ockl_lane_u32();
    if (lane == 63)
        return 0;
    uint64_t ballot = __ballot64(1);
    uint64_t mask = (~((uint64_t)0)) << (lane + 1);
    return mask & ballot;
}

__device__
inline
uint64_t __lanemask_lt()
{
    uint32_t lane = __ockl_lane_u32();
    uint64_t ballot = __ballot64(1);
    uint64_t mask = ((uint64_t)1 << lane) - (uint64_t)1;
    return mask & ballot;
}

__device__
inline
uint64_t __lanemask_eq()
{
    uint32_t lane = __ockl_lane_u32();
    uint64_t mask = ((uint64_t)1 << lane);
    return mask;
}
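
// Usage sketch (illustrative): rank the active lanes that hold a predicate,
// e.g. to compact values into consecutive output slots.
//   uint64_t voters = __ballot64(pred);
//   unsigned rank = __popcll(voters & __lanemask_lt());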

__device__ inline void* __local_to_generic(void* p) { return p; }

#ifdef __HIP_DEVICE_COMPILE__
__device__
inline
void* __get_dynamicgroupbaseptr()
{
    // Get group segment base pointer.
    return (char*)__local_to_generic((void*)__to_local(__builtin_amdgcn_groupstaticsize()));
}
#else
__device__
void* __get_dynamicgroupbaseptr();
#endif // __HIP_DEVICE_COMPILE__

__device__
inline
void *__amdgcn_get_dynamicgroupbaseptr() {
    return __get_dynamicgroupbaseptr();
}

// Memory Fence Functions
__device__
inline
static void __threadfence()
{
    __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "agent");
}

__device__
inline
static void __threadfence_block()
{
    __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup");
}

__device__
inline
static void __threadfence_system()
{
    __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "");
}
__device__ inline static void __work_group_barrier(__cl_mem_fence_flags flags) {
    if (flags) {
        __builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup");
        __builtin_amdgcn_s_barrier();
        __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
    } else {
        __builtin_amdgcn_s_barrier();
    }
}

__device__
inline
static void __barrier(int n)
{
    __work_group_barrier((__cl_mem_fence_flags)n);
}

__device__
inline
__attribute__((convergent))
void __syncthreads()
{
    __barrier(__CLK_LOCAL_MEM_FENCE);
}

__device__
inline
__attribute__((convergent))
int __syncthreads_count(int predicate)
{
    return __ockl_wgred_add_i32(!!predicate);
}

__device__
inline
__attribute__((convergent))
int __syncthreads_and(int predicate)
{
    return __ockl_wgred_and_i32(!!predicate);
}

__device__
inline
__attribute__((convergent))
int __syncthreads_or(int predicate)
{
    return __ockl_wgred_or_i32(!!predicate);
}
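
// Usage sketch (illustrative): every thread in the block receives the same
// reduction of the predicate across the whole block.
//   int hits = __syncthreads_count(localVal > threshold);  // block-wide count
//   int all  = __syncthreads_and(localVal > threshold);    // nonzero if all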

// hip.amdgcn.bc - device routine
/*
  HW_ID Register bit structure for RDNA2 & RDNA3
  WAVE_ID     4:0     Wave id within the SIMD.
  SIMD_ID     9:8     SIMD_ID within the WGP: [0] = row, [1] = column.
  WGP_ID      13:10   Physical WGP ID.
  SA_ID       16      Shader Array ID
  SE_ID       20:18   Shader Engine the wave is assigned to for gfx11
              19:18   Shader Engine the wave is assigned to for gfx10
  DP_RATE     31:29   Number of double-precision float units per SIMD

  HW_ID Register bit structure for GCN and CDNA
  WAVE_ID     3:0     Wave buffer slot number. 0-9.
  SIMD_ID     5:4     SIMD which the wave is assigned to within the CU.
  PIPE_ID     7:6     Pipeline from which the wave was dispatched.
  CU_ID       11:8    Compute Unit the wave is assigned to.
  SH_ID       12      Shader Array (within an SE) the wave is assigned to.
  SE_ID       15:13   Shader Engine the wave is assigned to for gfx908, gfx90a, gfx940-942
              14:13   Shader Engine the wave is assigned to for Vega.
  TG_ID       19:16   Thread-group ID
  VM_ID       23:20   Virtual Memory ID
  QUEUE_ID    26:24   Queue from which this wave was dispatched.
  STATE_ID    29:27   State ID (graphics only, not compute).
  ME_ID       31:30   Micro-engine ID.

  XCC_ID Register bit structure for gfx940
  XCC_ID      3:0     XCC the wave is assigned to.
*/

#if (defined(__GFX10__) || defined(__GFX11__))
  #define HW_ID 23
#else
  #define HW_ID 4
#endif

#if (defined(__GFX10__) || defined(__GFX11__))
  #define HW_ID_WGP_ID_SIZE 4
  #define HW_ID_WGP_ID_OFFSET 10
  #if (defined(__AMDGCN_CUMODE__))
    #define HW_ID_CU_ID_SIZE 1
    #define HW_ID_CU_ID_OFFSET 8
  #endif
#else
  #define HW_ID_CU_ID_SIZE 4
  #define HW_ID_CU_ID_OFFSET 8
#endif

#if (defined(__gfx908__) || defined(__gfx90a__) || \
     defined(__GFX11__))
  #define HW_ID_SE_ID_SIZE 3
#else // 4 SEs/XCC for gfx940-942
  #define HW_ID_SE_ID_SIZE 2
#endif
#if (defined(__GFX10__) || defined(__GFX11__))
  #define HW_ID_SE_ID_OFFSET 18
  #define HW_ID_SA_ID_OFFSET 16
  #define HW_ID_SA_ID_SIZE 1
#else
  #define HW_ID_SE_ID_OFFSET 13
#endif

#if (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
  #define XCC_ID 20
  #define XCC_ID_XCC_ID_SIZE 4
  #define XCC_ID_XCC_ID_OFFSET 0
#endif

#if (!defined(__HIP_NO_IMAGE_SUPPORT) && \
     (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)))
  #define __HIP_NO_IMAGE_SUPPORT 1
#endif

/*
  Encoding of parameter bitmask
  HW_ID       5:0     HW_ID
  OFFSET      10:6    Range: 0..31
  SIZE        15:11   Range: 1..32
*/

#define GETREG_IMMED(SZ,OFF,REG) (((SZ) << 11) | ((OFF) << 6) | (REG))

/*
  __smid returns the wave's assigned Compute Unit and Shader Engine.
  The Compute Unit, CU_ID, is returned in bits 3:0, and the Shader Engine,
  SE_ID, in bits 5:4.
  Note: the results vary over time.
  SZ minus 1 since SIZE is 1-based.
*/
__device__
inline
unsigned __smid(void)
{
    unsigned se_id = __builtin_amdgcn_s_getreg(
            GETREG_IMMED(HW_ID_SE_ID_SIZE-1, HW_ID_SE_ID_OFFSET, HW_ID));
#if (defined(__GFX10__) || defined(__GFX11__))
    unsigned wgp_id = __builtin_amdgcn_s_getreg(
            GETREG_IMMED(HW_ID_WGP_ID_SIZE - 1, HW_ID_WGP_ID_OFFSET, HW_ID));
    unsigned sa_id = __builtin_amdgcn_s_getreg(
            GETREG_IMMED(HW_ID_SA_ID_SIZE - 1, HW_ID_SA_ID_OFFSET, HW_ID));
  #if (defined(__AMDGCN_CUMODE__))
    unsigned cu_id = __builtin_amdgcn_s_getreg(
            GETREG_IMMED(HW_ID_CU_ID_SIZE - 1, HW_ID_CU_ID_OFFSET, HW_ID));
  #endif
#else
  #if (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
    unsigned xcc_id = __builtin_amdgcn_s_getreg(
            GETREG_IMMED(XCC_ID_XCC_ID_SIZE - 1, XCC_ID_XCC_ID_OFFSET, XCC_ID));
  #endif
    unsigned cu_id = __builtin_amdgcn_s_getreg(
            GETREG_IMMED(HW_ID_CU_ID_SIZE - 1, HW_ID_CU_ID_OFFSET, HW_ID));
#endif
#if (defined(__GFX10__) || defined(__GFX11__))
    unsigned temp = se_id;
    temp = (temp << HW_ID_SA_ID_SIZE) | sa_id;
    temp = (temp << HW_ID_WGP_ID_SIZE) | wgp_id;
  #if (defined(__AMDGCN_CUMODE__))
    temp = (temp << HW_ID_CU_ID_SIZE) | cu_id;
  #endif
    return temp;
    // TODO: CU Mode impl
#elif (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
    unsigned temp = xcc_id;
    temp = (temp << HW_ID_SE_ID_SIZE) | se_id;
    temp = (temp << HW_ID_CU_ID_SIZE) | cu_id;
    return temp;
#else
    return (se_id << HW_ID_CU_ID_SIZE) + cu_id;
#endif
}
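
// Usage sketch (illustrative): on GCN/CDNA parts without an XCC, the packed
// value decodes as the Shader Engine in the upper bits and the Compute Unit
// in the low HW_ID_CU_ID_SIZE bits:
//   unsigned id = __smid();
//   unsigned cu = id & ((1u << HW_ID_CU_ID_SIZE) - 1);
//   unsigned se = id >> HW_ID_CU_ID_SIZE;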

#define HIP_DYNAMIC_SHARED(type, var) extern __shared__ type var[];
#define HIP_DYNAMIC_SHARED_ATTRIBUTE
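
// Usage sketch (illustrative): declare dynamically sized LDS inside a kernel
// and size it at launch time via the shared-memory launch argument.
//   __global__ void k(float* out) {
//       HIP_DYNAMIC_SHARED(float, sdata)
//       sdata[threadIdx.x] = out[threadIdx.x];
//       __syncthreads();
//       /* ... */
//   }
//   // hipLaunchKernelGGL(k, blocks, threads, bytesOfShared, 0, out);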

#endif // defined(__clang__) && defined(__HIP__)


// Loop-unrolled byte-wise copy; the trailing switch handles the remaining
// 1-3 bytes (cases intentionally fall through).
static inline __device__ void* __hip_hc_memcpy(void* dst, const void* src, size_t size) {
    auto dstPtr = static_cast<unsigned char*>(dst);
    auto srcPtr = static_cast<const unsigned char*>(src);

    while (size >= 4u) {
        dstPtr[0] = srcPtr[0];
        dstPtr[1] = srcPtr[1];
        dstPtr[2] = srcPtr[2];
        dstPtr[3] = srcPtr[3];

        size -= 4u;
        srcPtr += 4u;
        dstPtr += 4u;
    }
    switch (size) {
        case 3:
            dstPtr[2] = srcPtr[2];
        case 2:
            dstPtr[1] = srcPtr[1];
        case 1:
            dstPtr[0] = srcPtr[0];
    }

    return dst;
}

// Loop-unrolled byte-wise fill; the trailing switch handles the remaining
// 1-3 bytes (cases intentionally fall through).
static inline __device__ void* __hip_hc_memset(void* dst, unsigned char val, size_t size) {
    auto dstPtr = static_cast<unsigned char*>(dst);

    while (size >= 4u) {
        dstPtr[0] = val;
        dstPtr[1] = val;
        dstPtr[2] = val;
        dstPtr[3] = val;

        size -= 4u;
        dstPtr += 4u;
    }
    switch (size) {
        case 3:
            dstPtr[2] = val;
        case 2:
            dstPtr[1] = val;
        case 1:
            dstPtr[0] = val;
    }

    return dst;
}
#ifndef __OPENMP_AMDGCN__
static inline __device__ void* memcpy(void* dst, const void* src, size_t size) {
    return __hip_hc_memcpy(dst, src, size);
}

static inline __device__ void* memset(void* ptr, int val, size_t size) {
    unsigned char val8 = static_cast<unsigned char>(val);
    return __hip_hc_memset(ptr, val8, size);
}
#endif // !__OPENMP_AMDGCN__

#endif