HIP: Heterogenous-computing Interface for Portability
Loading...
Searching...
No Matches
amd_hip_bf16.h
Go to the documentation of this file.
1
108#ifndef _HIP_INCLUDE_HIP_AMD_DETAIL_HIP_BF16_H_
109#define _HIP_INCLUDE_HIP_AMD_DETAIL_HIP_BF16_H_
110
111#if !defined(__HIPCC_RTC__)
112#include <hip/amd_detail/amd_hip_common.h>
113#include <hip/amd_detail/amd_warp_functions.h> // Sync functions
114#endif // !defined(__HIPCC_RTC__)
115
116#include "amd_hip_vector_types.h" // float2 etc
117#include "device_library_decls.h" // ocml conversion functions
118#if defined(__clang__) && defined(__HIP__)
119#include "amd_hip_atomic.h"
120#endif // defined(__clang__) && defined(__HIP__)
121#include "math_fwd.h" // ocml device functions
122
123#define __BF16_DEVICE__ __device__
124#if defined(__HIPCC_RTC__)
125#define __BF16_HOST_DEVICE__ __BF16_DEVICE__
126#else
127#include <algorithm>
128#include <climits>
129#include <cmath>
130#define __BF16_HOST_DEVICE__ __host__ __BF16_DEVICE__
131#endif
132#define __BF16_DEVICE_STATIC__ __BF16_DEVICE__ static inline
133#define __BF16_HOST_DEVICE_STATIC__ __BF16_HOST_DEVICE__ static inline
134
135#if defined(__AVX512VL__) and defined(__AVX512BF16__) and not defined(__HIP_DEVICE_COMPILE__)
136// Enable with -mavx512vl -mavx512bf16
137#if defined(__MINGW64__)
138#include <intrin.h>
139#else
140#include <immintrin.h>
141#endif
142#define HIP_BF16_AVX512_OP 1
143static_assert(sizeof(__bf16) == sizeof(unsigned short),
144 "sizeof __bf16 should match sizeof unsigned short");
145#else
146#define HIP_BF16_AVX512_OP 0
147#endif
148
149#define HIPRT_ONE_BF16 __ushort_as_bfloat16((unsigned short)0x3F80U)
150#define HIPRT_ZERO_BF16 __ushort_as_bfloat16((unsigned short)0x0000U)
151#define HIPRT_INF_BF16 __ushort_as_bfloat16((unsigned short)0x7F80U)
152#define HIPRT_MAX_NORMAL_BF16 __ushort_as_bfloat16((unsigned short)0x7F7FU)
153#define HIPRT_MIN_DENORM_BF16 __ushort_as_bfloat16((unsigned short)0x0001U)
154#define HIPRT_NAN_BF16 __ushort_as_bfloat16((unsigned short)0x7FFFU)
155#define HIPRT_NEG_ZERO_BF16 __ushort_as_bfloat16((unsigned short)0x8000U)
156
157// Since we are using unsigned short to represent data in bfloat16, it can be of different sizes on
158// different machines. These naive checks should prevent some undefined behavior on systems which
159// have different sizes for basic types.
160#if !defined(__HIPCC_RTC__)
161static_assert(CHAR_BIT == 8, "byte size should be of 8 bits");
162#endif
163static_assert(sizeof(unsigned short) == 2, "size of unsigned short should be 2 bytes");
164
169typedef struct __attribute__((aligned(2))) {
170 unsigned short x;
171} __hip_bfloat16_raw;
172
177typedef struct __attribute__((aligned(4))) {
178 unsigned short x;
179 unsigned short y;
180} __hip_bfloat162_raw;
181
188struct __attribute__((aligned(2))) __hip_bfloat16 {
189 private:
190 __BF16_HOST_DEVICE_STATIC__ float bfloatraw_2_float(unsigned short val) {
191#if HIP_BF16_AVX512_OP
192 union {
193 unsigned short us;
194 __bf16 bf16;
195 } u = {val};
196 return _mm_cvtsbh_ss(u.bf16);
197#else
198 unsigned int uval = val << 16;
199 union {
200 unsigned int u32;
201 float fp32;
202 } u = {uval};
203 return u.fp32;
204#endif
205 }
206
207 __BF16_HOST_DEVICE_STATIC__ unsigned short float_2_bfloatraw(float f) {
208#if HIP_BF16_AVX512_OP
209 union {
210 __bf16 bf16;
211 unsigned short us;
212 } u = {_mm_cvtness_sbh(f)};
213 return u.us;
214#else
215 union {
216 float fp32;
217 unsigned int u32;
218 } u = {f};
219 if (~u.u32 & 0x7f800000) {
220 // When the exponent bits are not all 1s, then the value is zero, normal,
221 // or subnormal. We round the bfloat16 mantissa up by adding 0x7FFF, plus
222 // 1 if the least significant bit of the bfloat16 mantissa is 1 (odd).
223 // This causes the bfloat16's mantissa to be incremented by 1 if the 16
224 // least significant bits of the float mantissa are greater than 0x8000,
225 // or if they are equal to 0x8000 and the least significant bit of the
226 // bfloat16 mantissa is 1 (odd). This causes it to be rounded to even when
227 // the lower 16 bits are exactly 0x8000. If the bfloat16 mantissa already
228 // has the value 0x7f, then incrementing it causes it to become 0x00 and
229 // the exponent is incremented by one, which is the next higher FP value
230 // to the unrounded bfloat16 value. When the bfloat16 value is subnormal
231 // with an exponent of 0x00 and a mantissa of 0x7F, it may be rounded up
232 // to a normal value with an exponent of 0x01 and a mantissa of 0x00.
233 // When the bfloat16 value has an exponent of 0xFE and a mantissa of 0x7F,
234 // incrementing it causes it to become an exponent of 0xFF and a mantissa
235 // of 0x00, which is Inf, the next higher value to the unrounded value.
236 u.u32 += 0x7fff + ((u.u32 >> 16) & 1); // Round to nearest, round to even
237 } else if (u.u32 & 0xffff) {
238 // When all of the exponent bits are 1, the value is Inf or NaN.
239 // Inf is indicated by a zero mantissa. NaN is indicated by any nonzero
240 // mantissa bit. Quiet NaN is indicated by the most significant mantissa
241 // bit being 1. Signaling NaN is indicated by the most significant
242 // mantissa bit being 0 but some other bit(s) being 1. If any of the
243 // lower 16 bits of the mantissa are 1, we set the least significant bit
244 // of the bfloat16 mantissa, in order to preserve signaling NaN in case
245 // the bloat16's mantissa bits are all 0.
246 u.u32 |= 0x10000; // Preserve signaling NaN
247 }
248 return static_cast<unsigned short>(u.u32 >> 16);
249#endif
250 }
251
252 __BF16_HOST_DEVICE_STATIC__ unsigned short double_2_bfloatraw(double d_in) {
253 union {
254 float fp32;
255 unsigned int u32;
256 } u = {static_cast<float>(d_in)};
257 double d = u.fp32;
258
259 // Round to odd
260 if ((d_in > 0.0 && d > d_in) || (d_in < 0.0 && d < d_in)) {
261 u.u32--;
262 u.u32 |= 1;
263 }
264
265 return float_2_bfloatraw(u.fp32);
266 }
267
268 protected:
270 unsigned short __x;
271
272 public:
273 // TODO: SWDEV-452411
274 // Need to add constructor of __hip_bfloat16 from
275 // unsigned long long
276 // long long
277 // long
278 // unsigned long
279 // Casting directly to double might lead to double rounding.
280
282 __BF16_HOST_DEVICE__ __hip_bfloat16(unsigned int val)
283 : __x(double_2_bfloatraw(static_cast<double>(val))) {}
284
286 __BF16_HOST_DEVICE__ __hip_bfloat16(int val)
287 : __x(double_2_bfloatraw(static_cast<double>(val))) {}
288
290 __BF16_HOST_DEVICE__ __hip_bfloat16(unsigned short val)
291 : __x(float_2_bfloatraw(static_cast<float>(val))) {}
292
294 __BF16_HOST_DEVICE__ __hip_bfloat16(short val)
295 : __x(float_2_bfloatraw(static_cast<float>(val))) {}
296
298 __BF16_HOST_DEVICE__ __hip_bfloat16(const double val) : __x(double_2_bfloatraw(val)) {}
299
301 __BF16_HOST_DEVICE__ __hip_bfloat16(const float val) : __x(float_2_bfloatraw(val)) {}
302
304 __BF16_HOST_DEVICE__ __hip_bfloat16(const __hip_bfloat16_raw& val) : __x(val.x) {}
305
307 __BF16_HOST_DEVICE__ __hip_bfloat16() = default;
308
310 __BF16_HOST_DEVICE__ operator __hip_bfloat16_raw() const { return __hip_bfloat16_raw{__x}; }
311
313 __BF16_HOST_DEVICE__ operator __hip_bfloat16_raw() const volatile {
314 return __hip_bfloat16_raw{__x};
315 }
316
318 __BF16_HOST_DEVICE__ operator bool() const {
319 auto val = bfloatraw_2_float(__x);
320 return val != 0.0f && val != -0.0f;
321 }
322
324 __BF16_HOST_DEVICE__ operator char() const { return static_cast<char>(bfloatraw_2_float(__x)); }
325
327 __BF16_HOST_DEVICE__ operator float() const { return bfloatraw_2_float(__x); }
328
330 __BF16_HOST_DEVICE__ operator int() const { return static_cast<int>(bfloatraw_2_float(__x)); }
331
333 __BF16_HOST_DEVICE__ operator long() const { return static_cast<long>(bfloatraw_2_float(__x)); }
334
336 __BF16_HOST_DEVICE__ operator long long() const {
337 return static_cast<long long>(bfloatraw_2_float(__x));
338 }
339
341 __BF16_HOST_DEVICE__ operator short() const { return static_cast<short>(bfloatraw_2_float(__x)); }
342
344 __BF16_HOST_DEVICE__ operator signed char() const {
345 return static_cast<signed char>(bfloatraw_2_float(__x));
346 }
347
349 __BF16_HOST_DEVICE__ operator unsigned char() const {
350 return static_cast<unsigned char>(bfloatraw_2_float(__x));
351 }
352
354 __BF16_HOST_DEVICE__ operator unsigned int() const {
355 return static_cast<unsigned int>(bfloatraw_2_float(__x));
356 }
357
359 __BF16_HOST_DEVICE__ operator unsigned long() const {
360 return static_cast<unsigned long>(bfloatraw_2_float(__x));
361 }
362
364 __BF16_HOST_DEVICE__ operator unsigned long long() const {
365 return static_cast<unsigned long long>(bfloatraw_2_float(__x));
366 }
367
369 __BF16_HOST_DEVICE__ operator unsigned short() const {
370 return static_cast<unsigned short>(bfloatraw_2_float(__x));
371 }
372
373 // TODO: SWDEV-452411 add operator which converts unsigned long long and long long to bfloat
374
376 __BF16_HOST_DEVICE__ __hip_bfloat16& operator=(unsigned int val) {
377 __x = float_2_bfloatraw(static_cast<float>(val));
378 return *this;
379 }
380
382 __BF16_HOST_DEVICE__ __hip_bfloat16& operator=(int val) {
383 __x = float_2_bfloatraw(static_cast<float>(val));
384 return *this;
385 }
386
388 __BF16_HOST_DEVICE__ __hip_bfloat16& operator=(unsigned short val) {
389 __x = float_2_bfloatraw(static_cast<float>(val));
390 return *this;
391 }
392
394 __BF16_HOST_DEVICE__ __hip_bfloat16& operator=(short val) {
395 __x = float_2_bfloatraw(static_cast<float>(val));
396 return *this;
397 }
398
400 __BF16_HOST_DEVICE__ __hip_bfloat16& operator=(const double f) {
401 __x = float_2_bfloatraw(static_cast<float>(f));
402 return *this;
403 }
404
406 __BF16_HOST_DEVICE__ __hip_bfloat16& operator=(const float f) {
407 __x = float_2_bfloatraw(f);
408 return *this;
409 }
410
412 __BF16_HOST_DEVICE__ __hip_bfloat16& operator=(const __hip_bfloat16_raw& hr) {
413 __x = hr.x;
414 return *this;
415 }
416
418 __BF16_HOST_DEVICE__ volatile __hip_bfloat16& operator=(const __hip_bfloat16_raw& hr) volatile {
419 __x = hr.x;
420 return *this;
421 }
422
424 __BF16_HOST_DEVICE__ volatile __hip_bfloat16& operator=(
425 const volatile __hip_bfloat16_raw& hr) volatile {
426 __x = hr.x;
427 return *this;
428 }
429};
438struct __attribute__((aligned(4))) __hip_bfloat162 {
439 public:
440 __hip_bfloat16 x;
441 __hip_bfloat16 y;
444 public:
446 __BF16_HOST_DEVICE__ __hip_bfloat162(const __hip_bfloat162_raw& h2r)
447 : x(__hip_bfloat16(__hip_bfloat16_raw{h2r.x})),
448 y(__hip_bfloat16(__hip_bfloat16_raw{h2r.y})) {}
449
451 __BF16_HOST_DEVICE__ __hip_bfloat162(const __hip_bfloat162& val) : x(val.x), y(val.y) {}
452
454 __BF16_HOST_DEVICE__ __hip_bfloat162(const __hip_bfloat16& a, const __hip_bfloat16& b)
455 : x(a), y(b) {}
456
458 __BF16_HOST_DEVICE__ __hip_bfloat162() = default;
459
461 __BF16_HOST_DEVICE__ operator __hip_bfloat162_raw() const {
462 __hip_bfloat16_raw l = x;
463 __hip_bfloat16_raw r = y;
464 return __hip_bfloat162_raw{l.x, r.x};
465 }
466
468 __BF16_HOST_DEVICE__ operator float2() const {
469#if HIP_BF16_AVX512_OP
470 union {
471 __hip_bfloat162_raw raw2;
472 __bf16 bf162[2];
473 static_assert(sizeof(__bf16[2]) == sizeof(__hip_bfloat162_raw));
474 } u;
475 u.raw2 = *this;
476 __m128bh pbf16{u.bf162[0], u.bf162[1], 0, 0};
477 __m128 pf32 = _mm_cvtpbh_ps(pbf16);
478 float2 ret(pf32[0], pf32[1]);
479#else
480 float2 ret(x, y);
481#endif
482 return ret;
483 }
484
486 __BF16_HOST_DEVICE__ __hip_bfloat162& operator=(const __hip_bfloat162_raw& h2r) {
487 x = __hip_bfloat16(__hip_bfloat16_raw{h2r.x});
488 y = __hip_bfloat16(__hip_bfloat16_raw{h2r.y});
489 return *this;
490 }
491
493 __BF16_HOST_DEVICE__ __hip_bfloat162& operator=(const __hip_bfloat162& src) {
494 x = src.x;
495 y = src.y;
496 return *this;
497 }
498};
505__BF16_HOST_DEVICE_STATIC__ float __bfloat162float(__hip_bfloat16 a) {
506 float ret = a;
507 return ret;
508}
509
514__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __float2bfloat16(float f) {
515 __hip_bfloat16 ret{f};
516 return ret;
517}
518
523__BF16_HOST_DEVICE_STATIC__ float2 __bfloat1622float2(const __hip_bfloat162 a) {
524 float2 ret = a;
525 return ret;
526}
527
532__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __bfloat162bfloat162(const __hip_bfloat16 a) {
533 return __hip_bfloat162(a, a);
534}
535
540__BF16_HOST_DEVICE_STATIC__ short int __bfloat16_as_short(const __hip_bfloat16 h) {
541 static_assert(sizeof(__hip_bfloat16) == sizeof(short int));
542 union {
543 __hip_bfloat16 bf16;
544 short int si;
545 } u{h};
546 return u.si;
547}
548
553__BF16_HOST_DEVICE_STATIC__ unsigned short int __bfloat16_as_ushort(const __hip_bfloat16 h) {
554 static_assert(sizeof(__hip_bfloat16) == sizeof(unsigned short int));
555 union {
556 __hip_bfloat16 bf16;
557 unsigned short int usi;
558 } u{h};
559 return u.usi;
560}
561
566__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __double2bfloat16(const double a) {
567 __hip_bfloat16 ret{a};
568 return ret;
569}
570
575__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __float22bfloat162_rn(const float2 a) {
576 return __hip_bfloat162{__float2bfloat16(a.x), __float2bfloat16(a.y)};
577}
578
583__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __halves2bfloat162(const __hip_bfloat16 a,
584 const __hip_bfloat16 b) {
585 return __hip_bfloat162(a, b);
586}
587
592__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __high2bfloat16(const __hip_bfloat162 a) { return a.y; }
593
598__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __high2bfloat162(const __hip_bfloat162 a) {
599 return __hip_bfloat162(a.y, a.y);
600}
601
606__BF16_HOST_DEVICE_STATIC__ float __high2float(const __hip_bfloat162 a) {
607 return __bfloat162float(a.y);
608}
609
614__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __highs2bfloat162(const __hip_bfloat162 a,
615 const __hip_bfloat162 b) {
616 return __hip_bfloat162(a.y, b.y);
617}
618
623__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __low2bfloat16(const __hip_bfloat162 a) { return a.x; }
624
629__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __low2bfloat162(const __hip_bfloat162 a) {
630 return __hip_bfloat162(a.x, a.x);
631}
632
637__BF16_HOST_DEVICE_STATIC__ float __low2float(const __hip_bfloat162 a) {
638 return __bfloat162float(a.x);
639}
640
645__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __lowhigh2highlow(const __hip_bfloat162 a) {
646 return __hip_bfloat162(a.y, a.x);
647}
648
653__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __lows2bfloat162(const __hip_bfloat162 a,
654 const __hip_bfloat162 b) {
655 return __hip_bfloat162(a.x, b.x);
656}
657
662__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __short_as_bfloat16(const short int a) {
663 static_assert(sizeof(__hip_bfloat16) == sizeof(short int));
664 union {
665 short int si;
666 __hip_bfloat16 bf16;
667 } u{a};
668 return u.bf16;
669}
670
675__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __ushort_as_bfloat16(const unsigned short int a) {
676 static_assert(sizeof(__hip_bfloat16) == sizeof(unsigned short int));
677 union {
678 unsigned short int usi;
679 __hip_bfloat16 bf16;
680 } u{a};
681 return u.bf16;
682}
683
684#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS
689__BF16_DEVICE_STATIC__ __hip_bfloat16 __shfl_down_sync(const unsigned long long mask,
690 const __hip_bfloat16 in,
691 const unsigned int delta,
692 const int width = __AMDGCN_WAVEFRONT_SIZE) {
693 return __ushort_as_bfloat16(__shfl_down_sync(mask, __bfloat16_as_ushort(in), delta, width));
694}
695
700__BF16_DEVICE_STATIC__ __hip_bfloat162 __shfl_down_sync(const unsigned long long mask,
701 const __hip_bfloat162 in,
702 const unsigned int delta,
703 const int width = __AMDGCN_WAVEFRONT_SIZE) {
704 static_assert(sizeof(__hip_bfloat162) == sizeof(unsigned int));
705 union {
706 __hip_bfloat162 bf162;
707 unsigned int ui;
708 } u{in};
709 u.ui = __shfl_down_sync<unsigned long long, unsigned int>(mask, u.ui, delta, width);
710 return u.bf162;
711}
712
717__BF16_DEVICE_STATIC__ __hip_bfloat16 __shfl_sync(const unsigned long long mask,
718 const __hip_bfloat16 in, const int delta,
719 const int width = __AMDGCN_WAVEFRONT_SIZE) {
720 return __ushort_as_bfloat16(__shfl_sync(mask, __bfloat16_as_ushort(in), delta, width));
721}
722
727__BF16_DEVICE_STATIC__ __hip_bfloat162 __shfl_sync(const unsigned long long mask,
728 const __hip_bfloat162 in, const int delta,
729 const int width = __AMDGCN_WAVEFRONT_SIZE) {
730 static_assert(sizeof(__hip_bfloat162) == sizeof(unsigned int));
731 union {
732 __hip_bfloat162 bf162;
733 unsigned int ui;
734 } u{in};
735 u.ui = __shfl_sync(mask, u.ui, delta, width);
736 return u.bf162;
737}
738
743__BF16_DEVICE_STATIC__ __hip_bfloat16 __shfl_up_sync(const unsigned long long mask,
744 const __hip_bfloat16 in,
745 const unsigned int delta,
746 const int width = __AMDGCN_WAVEFRONT_SIZE) {
747 return __ushort_as_bfloat16(__shfl_up_sync(mask, __bfloat16_as_ushort(in), delta, width));
748}
749
754__BF16_DEVICE_STATIC__ __hip_bfloat162 __shfl_up_sync(const unsigned long long mask,
755 const __hip_bfloat162 in,
756 const unsigned int delta,
757 const int width = __AMDGCN_WAVEFRONT_SIZE) {
758 static_assert(sizeof(__hip_bfloat162) == sizeof(unsigned int));
759 union {
760 __hip_bfloat162 bf162;
761 unsigned int ui;
762 } u{in};
763 u.ui = __shfl_up_sync(mask, u.ui, delta, width);
764 return u.bf162;
765}
766
771__BF16_DEVICE_STATIC__ __hip_bfloat16 __shfl_xor_sync(const unsigned long long mask,
772 const __hip_bfloat16 in, const int delta,
773 const int width = __AMDGCN_WAVEFRONT_SIZE) {
774 return __ushort_as_bfloat16(__shfl_xor_sync(mask, __bfloat16_as_ushort(in), delta, width));
775}
776
781__BF16_DEVICE_STATIC__ __hip_bfloat162 __shfl_xor_sync(const unsigned long long mask,
782 const __hip_bfloat162 in, const int delta,
783 const int width = __AMDGCN_WAVEFRONT_SIZE) {
784 static_assert(sizeof(__hip_bfloat162) == sizeof(unsigned int));
785 union {
786 __hip_bfloat162 bf162;
787 unsigned int ui;
788 } u{in};
789 u.ui = __shfl_xor_sync(mask, u.ui, delta, width);
790 return u.bf162;
791}
792#endif
793
798__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hadd(const __hip_bfloat16 a, const __hip_bfloat16 b) {
800}
801
806__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hsub(const __hip_bfloat16 a, const __hip_bfloat16 b) {
808}
809
814__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hdiv(const __hip_bfloat16 a, const __hip_bfloat16 b) {
816}
817
822__BF16_DEVICE_STATIC__ __hip_bfloat16 __hfma(const __hip_bfloat16 a, const __hip_bfloat16 b,
823 const __hip_bfloat16 c) {
824 return __float2bfloat16(
825 __ocml_fma_f32(__bfloat162float(a), __bfloat162float(b), __bfloat162float(c)));
826}
827
832__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hmul(const __hip_bfloat16 a, const __hip_bfloat16 b) {
834}
835
840__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hneg(const __hip_bfloat16 a) {
841 __hip_bfloat16_raw hr = a;
842 hr.x ^= 0x8000;
843 return __hip_bfloat16(hr);
844}
845
850__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __habs(const __hip_bfloat16 a) {
851 __hip_bfloat16_raw hr = a;
852 hr.x &= 0x7FFF;
853 return __hip_bfloat16(hr);
854}
855
860__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __h2div(const __hip_bfloat162 a,
861 const __hip_bfloat162 b) {
862 return __hip_bfloat162(__float2bfloat16(__bfloat162float(a.x) / __bfloat162float(b.x)),
864}
865
870__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __habs2(const __hip_bfloat162 a) {
871 return __hip_bfloat162(__habs(a.x), __habs(a.y));
872}
873
878__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hadd2(const __hip_bfloat162 a,
879 const __hip_bfloat162 b) {
880 return __hip_bfloat162(__hadd(a.x, b.x), __hadd(a.y, b.y));
881}
882
887__BF16_DEVICE_STATIC__ __hip_bfloat162 __hfma2(const __hip_bfloat162 a, const __hip_bfloat162 b,
888 const __hip_bfloat162 c) {
889 return __hip_bfloat162(__hfma(a.x, b.x, c.x), __hfma(a.y, b.y, c.y));
890}
891
896__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hmul2(const __hip_bfloat162 a,
897 const __hip_bfloat162 b) {
898 return __hip_bfloat162(__hmul(a.x, b.x), __hmul(a.y, b.y));
899}
900
905__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hneg2(const __hip_bfloat162 a) {
906 return __hip_bfloat162(__hneg(a.x), __hneg(a.y));
907}
908
913__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hsub2(const __hip_bfloat162 a,
914 const __hip_bfloat162 b) {
915 return __hip_bfloat162(__hsub(a.x, b.x), __hsub(a.y, b.y));
916}
917
922__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 operator*(const __hip_bfloat16& l,
923 const __hip_bfloat16& r) {
924 return __hmul(l, r);
925}
926
931__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16& operator*=(__hip_bfloat16& l, const __hip_bfloat16& r) {
932 l = __hmul(l, r);
933 return l;
934}
935
940__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 operator+(const __hip_bfloat16& l) { return l; }
941
946__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 operator+(const __hip_bfloat16& l,
947 const __hip_bfloat16& r) {
948 return __hadd(l, r);
949}
950
955__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 operator-(const __hip_bfloat16& l) { return __hneg(l); }
956
961__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 operator-(const __hip_bfloat16& l,
962 const __hip_bfloat16& r) {
963 return __hsub(l, r);
964}
965
970__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 operator++(__hip_bfloat16& l, const int) {
971 auto ret = l;
972 l = __hadd(l, HIPRT_ONE_BF16);
973 return ret;
974}
975
980__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16& operator++(__hip_bfloat16& l) {
981 l = __hadd(l, HIPRT_ONE_BF16);
982 return l;
983}
984
989__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 operator--(__hip_bfloat16& l, const int) {
990 auto ret = l;
991 l = __hsub(l, HIPRT_ONE_BF16);
992 return ret;
993}
994
999__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16& operator--(__hip_bfloat16& l) {
1000 l = __hsub(l, HIPRT_ONE_BF16);
1001 return l;
1002}
1003
1008__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16& operator+=(__hip_bfloat16& l, const __hip_bfloat16& r) {
1009 l = __hadd(l, r);
1010 return l;
1011}
1012
1017__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16& operator-=(__hip_bfloat16& l, const __hip_bfloat16& r) {
1018 l = __hsub(l, r);
1019 return l;
1020}
1021
1026__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 operator/(const __hip_bfloat16& l,
1027 const __hip_bfloat16& r) {
1028 return __hdiv(l, r);
1029}
1030
1035__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16& operator/=(__hip_bfloat16& l, const __hip_bfloat16& r) {
1036 l = __hdiv(l, r);
1037 return l;
1038}
1039
1044__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 operator*(const __hip_bfloat162& l,
1045 const __hip_bfloat162& r) {
1046 return __hmul2(l, r);
1047}
1048
1053__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162& operator*=(__hip_bfloat162& l,
1054 const __hip_bfloat162& r) {
1055 l = __hmul2(l, r);
1056 return l;
1057}
1058
1063__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 operator+(const __hip_bfloat162& l) { return l; }
1064
1069__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 operator+(const __hip_bfloat162& l,
1070 const __hip_bfloat162& r) {
1071 return __hadd2(l, r);
1072}
1073
1078__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 operator-(const __hip_bfloat162& l) {
1079 return __hneg2(l);
1080}
1081
1086__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 operator-(const __hip_bfloat162& l,
1087 const __hip_bfloat162& r) {
1088 return __hsub2(l, r);
1089}
1090
1095__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 operator++(__hip_bfloat162& l, const int) {
1096 auto ret = l;
1097 l = __hadd2(l, {HIPRT_ONE_BF16, HIPRT_ONE_BF16});
1098 return ret;
1099}
1100
1105__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162& operator++(__hip_bfloat162& l) {
1106 l = __hadd2(l, {HIPRT_ONE_BF16, HIPRT_ONE_BF16});
1107 return l;
1108}
1109
1114__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 operator--(__hip_bfloat162& l, const int) {
1115 auto ret = l;
1116 l = __hsub2(l, {HIPRT_ONE_BF16, HIPRT_ONE_BF16});
1117 return ret;
1118}
1119
1124__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162& operator--(__hip_bfloat162& l) {
1125 l = __hsub2(l, {HIPRT_ONE_BF16, HIPRT_ONE_BF16});
1126 return l;
1127}
1128
1133__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162& operator+=(__hip_bfloat162& l,
1134 const __hip_bfloat162& r) {
1135 l = __hadd2(l, r);
1136 return l;
1137}
1138
1143__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162& operator-=(__hip_bfloat162& l,
1144 const __hip_bfloat162& r) {
1145 l = __hsub2(l, r);
1146 return l;
1147}
1148
1153__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 operator/(const __hip_bfloat162& l,
1154 const __hip_bfloat162& r) {
1155 return __h2div(l, r);
1156}
1157
1162__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162& operator/=(__hip_bfloat162& l,
1163 const __hip_bfloat162& r) {
1164 l = __h2div(l, r);
1165 return l;
1166}
1167
1172__BF16_HOST_DEVICE_STATIC__ bool __heq(const __hip_bfloat16 a, const __hip_bfloat16 b) {
1173 return __bfloat162float(a) == __bfloat162float(b);
1174}
1175
1180__BF16_HOST_DEVICE_STATIC__ bool __hequ(const __hip_bfloat16 a, const __hip_bfloat16 b) {
1181 return !(__bfloat162float(a) < __bfloat162float(b)) &&
1183}
1184
1189__BF16_HOST_DEVICE_STATIC__ bool __hgt(const __hip_bfloat16 a, const __hip_bfloat16 b) {
1190 return __bfloat162float(a) > __bfloat162float(b);
1191}
1192
1197__BF16_HOST_DEVICE_STATIC__ bool __hgtu(const __hip_bfloat16 a, const __hip_bfloat16 b) {
1198 return !(__bfloat162float(a) <= __bfloat162float(b));
1199}
1200
1205__BF16_HOST_DEVICE_STATIC__ bool __hge(const __hip_bfloat16 a, const __hip_bfloat16 b) {
1206 return __bfloat162float(a) >= __bfloat162float(b);
1207}
1208
1213__BF16_HOST_DEVICE_STATIC__ bool __hgeu(const __hip_bfloat16 a, const __hip_bfloat16 b) {
1214 return !(__bfloat162float(a) < __bfloat162float(b));
1215}
1216
1221__BF16_HOST_DEVICE_STATIC__ bool __hne(const __hip_bfloat16 a, const __hip_bfloat16 b) {
1222 return __bfloat162float(a) != __bfloat162float(b);
1223}
1224
1229__BF16_HOST_DEVICE_STATIC__ bool __hneu(const __hip_bfloat16 a, const __hip_bfloat16 b) {
1230 return !(__bfloat162float(a) == __bfloat162float(b));
1231}
1232
1237__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hmax(const __hip_bfloat16 a, const __hip_bfloat16 b) {
1238#if __HIP_DEVICE_COMPILE__
1239 return __float2bfloat16(__ocml_fmax_f32(__bfloat162float(a), __bfloat162float(b)));
1240#else
1241 return __float2bfloat16(std::max(__bfloat162float(a), __bfloat162float(b)));
1242#endif
1243}
1244
1249__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hmin(const __hip_bfloat16 a, const __hip_bfloat16 b) {
1250#if __HIP_DEVICE_COMPILE__
1251 return __float2bfloat16(__ocml_fmin_f32(__bfloat162float(a), __bfloat162float(b)));
1252#else
1253 return __float2bfloat16(std::min(__bfloat162float(a), __bfloat162float(b)));
1254#endif
1255}
1256
1261__BF16_HOST_DEVICE_STATIC__ bool __hlt(const __hip_bfloat16 a, const __hip_bfloat16 b) {
1262 return __bfloat162float(a) < __bfloat162float(b);
1263}
1264
1269__BF16_HOST_DEVICE_STATIC__ bool __hltu(const __hip_bfloat16 a, const __hip_bfloat16 b) {
1270 return !(__bfloat162float(a) >= __bfloat162float(b));
1271}
1272
1277__BF16_HOST_DEVICE_STATIC__ bool __hle(const __hip_bfloat16 a, const __hip_bfloat16 b) {
1278 return __bfloat162float(a) <= __bfloat162float(b);
1279}
1280
1285__BF16_HOST_DEVICE_STATIC__ bool __hleu(const __hip_bfloat16 a, const __hip_bfloat16 b) {
1286 return !(__bfloat162float(a) > __bfloat162float(b));
1287}
1288
1293__BF16_HOST_DEVICE_STATIC__ int __hisinf(const __hip_bfloat16 a) {
1294 __hip_bfloat16_raw hr = a;
1295 return !(~hr.x & 0x7f80) && !(hr.x & 0x7f);
1296}
1297
1302__BF16_HOST_DEVICE_STATIC__ bool __hisnan(const __hip_bfloat16 a) {
1303 __hip_bfloat16_raw hr = a;
1304 return !(~hr.x & 0x7f80) && +(hr.x & 0x7f);
1305}
1306
1311__BF16_HOST_DEVICE_STATIC__ bool __hbeq2(const __hip_bfloat162 a, const __hip_bfloat162 b) {
1312 return __heq(a.x, b.x) && __heq(a.y, b.y);
1313}
1314
1319__BF16_HOST_DEVICE_STATIC__ bool __hbequ2(const __hip_bfloat162 a, const __hip_bfloat162 b) {
1320 return __hequ(a.x, b.x) && __hequ(a.y, b.y);
1321}
1322
1327__BF16_HOST_DEVICE_STATIC__ bool __hbge2(const __hip_bfloat162 a, const __hip_bfloat162 b) {
1328 return __hge(a.x, b.x) && __hge(a.y, b.y);
1329}
1330
1335__BF16_HOST_DEVICE_STATIC__ bool __hbgeu2(const __hip_bfloat162 a, const __hip_bfloat162 b) {
1336 return __hgeu(a.x, b.x) && __hgeu(a.y, b.y);
1337}
1338
1343__BF16_HOST_DEVICE_STATIC__ bool __hbgt2(const __hip_bfloat162 a, const __hip_bfloat162 b) {
1344 return __hgt(a.x, b.x) && __hgt(a.y, b.y);
1345}
1346
1351__BF16_HOST_DEVICE_STATIC__ bool __hbgtu2(const __hip_bfloat162 a, const __hip_bfloat162 b) {
1352 return __hgtu(a.x, b.x) && __hgtu(a.y, b.y);
1353}
1354
1359__BF16_HOST_DEVICE_STATIC__ bool __hble2(const __hip_bfloat162 a, const __hip_bfloat162 b) {
1360 return __hle(a.x, b.x) && __hle(a.y, b.y);
1361}
1362
1367__BF16_HOST_DEVICE_STATIC__ bool __hbleu2(const __hip_bfloat162 a, const __hip_bfloat162 b) {
1368 return __hleu(a.x, b.x) && __hleu(a.y, b.y);
1369}
1370
1375__BF16_HOST_DEVICE_STATIC__ bool __hblt2(const __hip_bfloat162 a, const __hip_bfloat162 b) {
1376 return __hlt(a.x, b.x) && __hlt(a.y, b.y);
1377}
1378
1383__BF16_HOST_DEVICE_STATIC__ bool __hbltu2(const __hip_bfloat162 a, const __hip_bfloat162 b) {
1384 return __hltu(a.x, b.x) && __hltu(a.y, b.y);
1385}
1386
1391__BF16_HOST_DEVICE_STATIC__ bool __hbne2(const __hip_bfloat162 a, const __hip_bfloat162 b) {
1392 return __hne(a.x, b.x) && __hne(a.y, b.y);
1393}
1394
1399__BF16_HOST_DEVICE_STATIC__ bool __hbneu2(const __hip_bfloat162 a, const __hip_bfloat162 b) {
1400 return __hneu(a.x, b.x) || __hneu(a.y, b.y);
1401}
1402
1407__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __heq2(const __hip_bfloat162 a,
1408 const __hip_bfloat162 b) {
1409 return __hip_bfloat162{{__heq(a.x, b.x) ? HIPRT_ONE_BF16 : HIPRT_ZERO_BF16},
1410 {__heq(a.y, b.y) ? HIPRT_ONE_BF16 : HIPRT_ZERO_BF16}};
1411}
1412
1417__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hge2(const __hip_bfloat162 a,
1418 const __hip_bfloat162 b) {
1419 return __hip_bfloat162{{__hge(a.x, b.x) ? HIPRT_ONE_BF16 : HIPRT_ZERO_BF16},
1420 {__hge(a.y, b.y) ? HIPRT_ONE_BF16 : HIPRT_ZERO_BF16}};
1421}
1422
1427__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hgt2(const __hip_bfloat162 a,
1428 const __hip_bfloat162 b) {
1429 return __hip_bfloat162{{__hgt(a.x, b.x) ? HIPRT_ONE_BF16 : HIPRT_ZERO_BF16},
1430 {__hgt(a.y, b.y) ? HIPRT_ONE_BF16 : HIPRT_ONE_BF16}};
1431}
1432
1437__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hisnan2(const __hip_bfloat162 a) {
1438 return __hip_bfloat162{{__hisnan(a.x) ? HIPRT_ONE_BF16 : HIPRT_ZERO_BF16},
1439 {__hisnan(a.y) ? HIPRT_ONE_BF16 : HIPRT_ONE_BF16}};
1440}
1441
1446__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hle2(const __hip_bfloat162 a,
1447 const __hip_bfloat162 b) {
1448 return __hip_bfloat162{{__hle(a.x, b.x) ? HIPRT_ONE_BF16 : HIPRT_ZERO_BF16},
1449 {__hle(a.y, b.y) ? HIPRT_ONE_BF16 : HIPRT_ZERO_BF16}};
1450}
1451
1456__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hlt2(const __hip_bfloat162 a,
1457 const __hip_bfloat162 b) {
1458 return __hip_bfloat162{{__hlt(a.x, b.x) ? HIPRT_ONE_BF16 : HIPRT_ZERO_BF16},
1459 {__hlt(a.y, b.y) ? HIPRT_ONE_BF16 : HIPRT_ZERO_BF16}};
1460}
1461
1466__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hmax2(const __hip_bfloat162 a,
1467 const __hip_bfloat162 b) {
1468 return __hip_bfloat162(__hmax(a.x, b.x), __hmax(a.y, b.y));
1469}
1470
1475__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hmin2(const __hip_bfloat162 a,
1476 const __hip_bfloat162 b) {
1477 return __hip_bfloat162(__hmin(a.x, b.x), __hmin(a.y, b.y));
1478}
1479
1484__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hne2(const __hip_bfloat162 a,
1485 const __hip_bfloat162 b) {
1486 return __hip_bfloat162{{__hne(a.x, b.x) ? HIPRT_ONE_BF16 : HIPRT_ZERO_BF16},
1487 {__hne(a.y, b.y) ? HIPRT_ONE_BF16 : HIPRT_ZERO_BF16}};
1488}
1489
1494__BF16_HOST_DEVICE_STATIC__ bool operator==(const __hip_bfloat16& l, const __hip_bfloat16& r) {
1495 return __heq(l, r);
1496}
1497
1502__BF16_HOST_DEVICE_STATIC__ bool operator!=(const __hip_bfloat16& l, const __hip_bfloat16& r) {
1503 return __hne(l, r);
1504}
1505
1510__BF16_HOST_DEVICE_STATIC__ bool operator<(const __hip_bfloat16& l, const __hip_bfloat16& r) {
1511 return __hlt(l, r);
1512}
1513
1518__BF16_HOST_DEVICE_STATIC__ bool operator<=(const __hip_bfloat16& l, const __hip_bfloat16& r) {
1519 return __hle(l, r);
1520}
1521
1526__BF16_HOST_DEVICE_STATIC__ bool operator>(const __hip_bfloat16& l, const __hip_bfloat16& r) {
1527 return __hgt(l, r);
1528}
1529
1534__BF16_HOST_DEVICE_STATIC__ bool operator>=(const __hip_bfloat16& l, const __hip_bfloat16& r) {
1535 return __hge(l, r);
1536}
1537
1542__BF16_HOST_DEVICE_STATIC__ bool operator==(const __hip_bfloat162& l, const __hip_bfloat162& r) {
1543 float2 ret = __heq2(l, r);
1544 return ret.x != 0.0f && ret.y != 0.0f;
1545}
1546
1551__BF16_HOST_DEVICE_STATIC__ bool operator!=(const __hip_bfloat162& l, const __hip_bfloat162& r) {
1552 return !(l == r);
1553}
1554
1559__BF16_HOST_DEVICE_STATIC__ bool operator<(const __hip_bfloat162& l, const __hip_bfloat162& r) {
1560 float2 fl = l, fr = r;
1561 return fl.x < fr.x && fl.x < fr.y;
1562}
1563
1568__BF16_HOST_DEVICE_STATIC__ bool operator<=(const __hip_bfloat162& l, const __hip_bfloat162& r) {
1569 float2 fl = l, fr = r;
1570 return fl.x <= fr.x && fl.x <= fr.y;
1571}
1572
1577__BF16_HOST_DEVICE_STATIC__ bool operator>(const __hip_bfloat162& l, const __hip_bfloat162& r) {
1578 float2 fl = l, fr = r;
1579 return fl.x > fr.x && fl.x > fr.y;
1580}
1581
1586__BF16_HOST_DEVICE_STATIC__ bool operator>=(const __hip_bfloat162& l, const __hip_bfloat162& r) {
1587 float2 fl = l, fr = r;
1588 return fl.x >= fr.x && fl.x >= fr.y;
1589}
1590
1595__BF16_DEVICE_STATIC__ __hip_bfloat16 hceil(const __hip_bfloat16 h) {
1596 return __float2bfloat16(__ocml_ceil_f32(__bfloat162float(h)));
1597}
1598
1603__BF16_DEVICE_STATIC__ __hip_bfloat16 hcos(const __hip_bfloat16 h) {
1604 return __float2bfloat16(__ocml_cos_f32(__bfloat162float(h)));
1605}
1606
1611__BF16_DEVICE_STATIC__ __hip_bfloat16 hexp(const __hip_bfloat16 h) {
1612 return __float2bfloat16(__ocml_exp_f32(__bfloat162float(h)));
1613}
1614
1619__BF16_DEVICE_STATIC__ __hip_bfloat16 hexp10(const __hip_bfloat16 h) {
1620 return __float2bfloat16(__ocml_exp10_f32(__bfloat162float(h)));
1621}
1622
1627__BF16_DEVICE_STATIC__ __hip_bfloat16 hexp2(const __hip_bfloat16 h) {
1628 return __float2bfloat16(__ocml_exp2_f32(__bfloat162float(h)));
1629}
1630
1635__BF16_DEVICE_STATIC__ __hip_bfloat16 hfloor(const __hip_bfloat16 h) {
1636 return __float2bfloat16(__ocml_floor_f32(__bfloat162float(h)));
1637}
1638
1643__BF16_DEVICE_STATIC__ __hip_bfloat16 hlog(const __hip_bfloat16 h) {
1644 return __float2bfloat16(__ocml_log_f32(__bfloat162float(h)));
1645}
1646
1651__BF16_DEVICE_STATIC__ __hip_bfloat16 hlog10(const __hip_bfloat16 h) {
1652 return __float2bfloat16(__ocml_log10_f32(__bfloat162float(h)));
1653}
1654
1659__BF16_DEVICE_STATIC__ __hip_bfloat16 hlog2(const __hip_bfloat16 h) {
1660 return __float2bfloat16(__ocml_log2_f32(__bfloat162float(h)));
1661}
1662
1667__BF16_DEVICE_STATIC__ __hip_bfloat16 hrcp(const __hip_bfloat16 h) {
1668 return __float2bfloat16(1.0f / (__bfloat162float(h)));
1669}
1670
1675__BF16_DEVICE_STATIC__ __hip_bfloat16 hrint(const __hip_bfloat16 h) {
1676 return __float2bfloat16(__ocml_rint_f32(__bfloat162float(h)));
1677}
1678
1683__BF16_DEVICE_STATIC__ __hip_bfloat16 hrsqrt(const __hip_bfloat16 h) {
1684 return __float2bfloat16(__ocml_rsqrt_f32(__bfloat162float(h)));
1685}
1686
1691__BF16_DEVICE_STATIC__ __hip_bfloat16 hsin(const __hip_bfloat16 h) {
1692 return __float2bfloat16(__ocml_sin_f32(__bfloat162float(h)));
1693}
1694
1699__BF16_DEVICE_STATIC__ __hip_bfloat16 hsqrt(const __hip_bfloat16 h) {
1700 return __float2bfloat16(__ocml_sqrt_f32(__bfloat162float(h)));
1701}
1702
1707__BF16_DEVICE_STATIC__ __hip_bfloat16 htrunc(const __hip_bfloat16 h) {
1708 return __float2bfloat16(__ocml_trunc_f32(__bfloat162float(h)));
1709}
1710
1715__BF16_DEVICE_STATIC__ __hip_bfloat162 h2ceil(const __hip_bfloat162 h) {
1716 return __hip_bfloat162(hceil(h.x), hceil(h.y));
1717}
1718
1723__BF16_DEVICE_STATIC__ __hip_bfloat162 h2cos(const __hip_bfloat162 h) {
1724 return __hip_bfloat162(hcos(h.x), hcos(h.y));
1725}
1726
1731__BF16_DEVICE_STATIC__ __hip_bfloat162 h2exp(const __hip_bfloat162 h) {
1732 return __hip_bfloat162(hexp(h.x), hexp(h.y));
1733}
1734
1739__BF16_DEVICE_STATIC__ __hip_bfloat162 h2exp10(const __hip_bfloat162 h) {
1740 return __hip_bfloat162(hexp10(h.x), hexp10(h.y));
1741}
1742
1747__BF16_DEVICE_STATIC__ __hip_bfloat162 h2exp2(const __hip_bfloat162 h) {
1748 return __hip_bfloat162(hexp2(h.x), hexp2(h.y));
1749}
1750
1755__BF16_DEVICE_STATIC__ __hip_bfloat162 h2floor(const __hip_bfloat162 h) {
1756 return __hip_bfloat162(hfloor(h.x), hfloor(h.y));
1757}
1758
1763__BF16_DEVICE_STATIC__ __hip_bfloat162 h2log(const __hip_bfloat162 h) {
1764 return __hip_bfloat162(hlog(h.x), hlog(h.y));
1765}
1766
1771__BF16_DEVICE_STATIC__ __hip_bfloat162 h2log10(const __hip_bfloat162 h) {
1772 return __hip_bfloat162(hlog10(h.x), hlog10(h.y));
1773}
1774
1779__BF16_DEVICE_STATIC__ __hip_bfloat162 h2log2(const __hip_bfloat162 h) {
1780 return __hip_bfloat162(hlog2(h.x), hlog2(h.y));
1781}
1782
1787__BF16_DEVICE_STATIC__ __hip_bfloat162 h2rcp(const __hip_bfloat162 h) {
1788 return __hip_bfloat162(hrcp(h.x), hrcp(h.y));
1789}
1790
1795__BF16_DEVICE_STATIC__ __hip_bfloat162 h2rint(const __hip_bfloat162 h) {
1796 return __hip_bfloat162(hrint(h.x), hrint(h.y));
1797}
1798
1803__BF16_DEVICE_STATIC__ __hip_bfloat162 h2rsqrt(const __hip_bfloat162 h) {
1804 return __hip_bfloat162(hrsqrt(h.x), hrsqrt(h.y));
1805}
1806
1811__BF16_DEVICE_STATIC__ __hip_bfloat162 h2sin(const __hip_bfloat162 h) {
1812 return __hip_bfloat162(hsin(h.x), hsin(h.y));
1813}
1814
1819__BF16_DEVICE_STATIC__ __hip_bfloat162 h2sqrt(const __hip_bfloat162 h) {
1820 return __hip_bfloat162(hsqrt(h.x), hsqrt(h.y));
1821}
1822
1827__BF16_DEVICE_STATIC__ __hip_bfloat162 h2trunc(const __hip_bfloat162 h) {
1828 return __hip_bfloat162(htrunc(h.x), htrunc(h.y));
1829}
1830
1831#if defined(__clang__) && defined(__HIP__)
1836__BF16_DEVICE_STATIC__ __hip_bfloat162 unsafeAtomicAdd(__hip_bfloat162* address,
1837 __hip_bfloat162 value) {
1838#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__) && __has_builtin(__builtin_amdgcn_flat_atomic_fadd_v2bf16)
1839 typedef short __attribute__((ext_vector_type(2))) vec_short2;
1840 __hip_bfloat162_raw bf2_v = value;
1841 vec_short2 s2_in{bf2_v.x, bf2_v.y};
1842 vec_short2 s2_ret = __builtin_amdgcn_flat_atomic_fadd_v2bf16((vec_short2*)address, s2_in);
1843 return __hip_bfloat162_raw{s2_ret[0], s2_ret[1]};
1844#else
1845 static_assert(sizeof(unsigned int) == sizeof(__hip_bfloat162_raw));
1846 union u_hold {
1847 __hip_bfloat162_raw h2r;
1848 unsigned int u32;
1849 };
1850 u_hold old_val, new_val;
1851 old_val.u32 =
1852 __hip_atomic_load((unsigned int*)address, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
1853 do {
1854 new_val.h2r = __hadd2(old_val.h2r, value);
1855 } while (!__hip_atomic_compare_exchange_strong((unsigned int*)address, &old_val.u32, new_val.u32,
1856 __ATOMIC_RELAXED, __ATOMIC_RELAXED,
1857 __HIP_MEMORY_SCOPE_AGENT));
1858 return old_val.h2r;
1859#endif
1860}
1861#endif // defined(__clang__) && defined(__HIP__)
1862#endif
Contains declarations for types and functions in device library. Uses int64_t and uint64_t instead of...
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hsub(const __hip_bfloat16 a, const __hip_bfloat16 b)
Subtracts two bfloat16 values.
Definition amd_hip_bf16.h:806
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 & operator-=(__hip_bfloat16 &l, const __hip_bfloat16 &r)
Operator to subtract-assign two __hip_bfloat16 numbers.
Definition amd_hip_bf16.h:1017
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 operator+(const __hip_bfloat16 &l)
Operator to unary+ on a __hip_bfloat16 number.
Definition amd_hip_bf16.h:940
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 operator/(const __hip_bfloat16 &l, const __hip_bfloat16 &r)
Operator to divide two __hip_bfloat16 numbers.
Definition amd_hip_bf16.h:1026
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 operator-(const __hip_bfloat16 &l)
Operator to negate a __hip_bfloat16 number.
Definition amd_hip_bf16.h:955
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hneg(const __hip_bfloat16 a)
Negate a bfloat16 value.
Definition amd_hip_bf16.h:840
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hadd(const __hip_bfloat16 a, const __hip_bfloat16 b)
Adds two bfloat16 values.
Definition amd_hip_bf16.h:798
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 & operator/=(__hip_bfloat16 &l, const __hip_bfloat16 &r)
Operator to divide-assign two __hip_bfloat16 numbers.
Definition amd_hip_bf16.h:1035
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 operator*(const __hip_bfloat16 &l, const __hip_bfloat16 &r)
Operator to multiply two __hip_bfloat16 numbers.
Definition amd_hip_bf16.h:922
__BF16_DEVICE_STATIC__ __hip_bfloat16 __hfma(const __hip_bfloat16 a, const __hip_bfloat16 b, const __hip_bfloat16 c)
Performs FMA of given bfloat16 values.
Definition amd_hip_bf16.h:822
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 & operator*=(__hip_bfloat16 &l, const __hip_bfloat16 &r)
Operator to multiply-assign two __hip_bfloat16 numbers.
Definition amd_hip_bf16.h:931
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hmul(const __hip_bfloat16 a, const __hip_bfloat16 b)
Multiplies two bfloat16 values.
Definition amd_hip_bf16.h:832
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 operator++(__hip_bfloat16 &l, const int)
Operator to post increment a __hip_bfloat16 number.
Definition amd_hip_bf16.h:970
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 & operator+=(__hip_bfloat16 &l, const __hip_bfloat16 &r)
Operator to add-assign two __hip_bfloat16 numbers.
Definition amd_hip_bf16.h:1008
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __habs(const __hip_bfloat16 a)
Returns absolute of a bfloat16.
Definition amd_hip_bf16.h:850
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 operator--(__hip_bfloat16 &l, const int)
Operator to post decrement a __hip_bfloat16 number.
Definition amd_hip_bf16.h:989
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hdiv(const __hip_bfloat16 a, const __hip_bfloat16 b)
Divides two bfloat16 values.
Definition amd_hip_bf16.h:814
__BF16_HOST_DEVICE_STATIC__ bool __heq(const __hip_bfloat16 a, const __hip_bfloat16 b)
Compare two bfloat162 values.
Definition amd_hip_bf16.h:1172
__BF16_HOST_DEVICE_STATIC__ bool __hgt(const __hip_bfloat16 a, const __hip_bfloat16 b)
Compare two bfloat162 values - greater than.
Definition amd_hip_bf16.h:1189
__BF16_HOST_DEVICE_STATIC__ bool __hne(const __hip_bfloat16 a, const __hip_bfloat16 b)
Compare two bfloat162 values - not equal.
Definition amd_hip_bf16.h:1221
__BF16_HOST_DEVICE_STATIC__ bool operator==(const __hip_bfloat16 &l, const __hip_bfloat16 &r)
Operator to perform an equal compare on two __hip_bfloat16 numbers.
Definition amd_hip_bf16.h:1494
__BF16_HOST_DEVICE_STATIC__ bool __hle(const __hip_bfloat16 a, const __hip_bfloat16 b)
Compare two bfloat162 values - less than equal.
Definition amd_hip_bf16.h:1277
__BF16_HOST_DEVICE_STATIC__ bool operator!=(const __hip_bfloat16 &l, const __hip_bfloat16 &r)
Operator to perform a not equal on two __hip_bfloat16 numbers.
Definition amd_hip_bf16.h:1502
__BF16_HOST_DEVICE_STATIC__ bool operator>(const __hip_bfloat16 &l, const __hip_bfloat16 &r)
Operator to perform a greater than on two __hip_bfloat16 numbers.
Definition amd_hip_bf16.h:1526
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hmin(const __hip_bfloat16 a, const __hip_bfloat16 b)
Compare two bfloat162 values - return min.
Definition amd_hip_bf16.h:1249
__BF16_HOST_DEVICE_STATIC__ bool __hge(const __hip_bfloat16 a, const __hip_bfloat16 b)
Compare two bfloat162 values - greater than equal.
Definition amd_hip_bf16.h:1205
__BF16_HOST_DEVICE_STATIC__ bool __hleu(const __hip_bfloat16 a, const __hip_bfloat16 b)
Compare two bfloat162 values - unordered less than equal.
Definition amd_hip_bf16.h:1285
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hmax(const __hip_bfloat16 a, const __hip_bfloat16 b)
Compare two bfloat162 values - return max.
Definition amd_hip_bf16.h:1237
__BF16_HOST_DEVICE_STATIC__ bool operator<=(const __hip_bfloat16 &l, const __hip_bfloat16 &r)
Operator to perform a less than equal on two __hip_bfloat16 numbers.
Definition amd_hip_bf16.h:1518
__BF16_HOST_DEVICE_STATIC__ int __hisinf(const __hip_bfloat16 a)
Checks if number is inf.
Definition amd_hip_bf16.h:1293
__BF16_HOST_DEVICE_STATIC__ bool __hneu(const __hip_bfloat16 a, const __hip_bfloat16 b)
Compare two bfloat162 values - unordered not equal.
Definition amd_hip_bf16.h:1229
__BF16_HOST_DEVICE_STATIC__ bool __hgtu(const __hip_bfloat16 a, const __hip_bfloat16 b)
Compare two bfloat162 values - unordered greater than.
Definition amd_hip_bf16.h:1197
__BF16_HOST_DEVICE_STATIC__ bool operator<(const __hip_bfloat16 &l, const __hip_bfloat16 &r)
Operator to perform a less than on two __hip_bfloat16 numbers.
Definition amd_hip_bf16.h:1510
__BF16_HOST_DEVICE_STATIC__ bool __hltu(const __hip_bfloat16 a, const __hip_bfloat16 b)
Compare two bfloat162 values - unordered less than.
Definition amd_hip_bf16.h:1269
__BF16_HOST_DEVICE_STATIC__ bool __hgeu(const __hip_bfloat16 a, const __hip_bfloat16 b)
Compare two bfloat162 values - unordered greater than equal.
Definition amd_hip_bf16.h:1213
__BF16_HOST_DEVICE_STATIC__ bool __hlt(const __hip_bfloat16 a, const __hip_bfloat16 b)
Compare two bfloat162 values - less than operator.
Definition amd_hip_bf16.h:1261
__BF16_HOST_DEVICE_STATIC__ bool operator>=(const __hip_bfloat16 &l, const __hip_bfloat16 &r)
Operator to perform a greater than equal on two __hip_bfloat16 numbers.
Definition amd_hip_bf16.h:1534
__BF16_HOST_DEVICE_STATIC__ bool __hisnan(const __hip_bfloat16 a)
Checks if number is nan.
Definition amd_hip_bf16.h:1302
__BF16_HOST_DEVICE_STATIC__ bool __hequ(const __hip_bfloat16 a, const __hip_bfloat16 b)
Compare two bfloat162 values - unordered equal.
Definition amd_hip_bf16.h:1180
__BF16_HOST_DEVICE_STATIC__ bool __hbgeu2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Check for a >= b - unordered.
Definition amd_hip_bf16.h:1335
__BF16_HOST_DEVICE_STATIC__ bool __hbgt2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Check for a > b.
Definition amd_hip_bf16.h:1343
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hmax2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Returns max of two elements.
Definition amd_hip_bf16.h:1466
__BF16_HOST_DEVICE_STATIC__ bool __hble2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Check for a <= b.
Definition amd_hip_bf16.h:1359
__BF16_HOST_DEVICE_STATIC__ bool __hblt2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Check for a < b.
Definition amd_hip_bf16.h:1375
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hmin2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Returns min of two elements.
Definition amd_hip_bf16.h:1475
__BF16_HOST_DEVICE_STATIC__ bool __hbleu2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Check for a <= b - unordered.
Definition amd_hip_bf16.h:1367
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hne2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Checks for not equal to.
Definition amd_hip_bf16.h:1484
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hge2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Check for a >= b, returns 1.0 if greater than equal, otherwise 0.0.
Definition amd_hip_bf16.h:1417
__BF16_HOST_DEVICE_STATIC__ bool __hbne2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Check for a != b.
Definition amd_hip_bf16.h:1391
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hlt2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Check for a < b, returns 1.0 if greater than equal, otherwise 0.0.
Definition amd_hip_bf16.h:1456
__BF16_HOST_DEVICE_STATIC__ bool __hbltu2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Check for a < b - unordered.
Definition amd_hip_bf16.h:1383
__BF16_HOST_DEVICE_STATIC__ bool __hbequ2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Checks if two numbers are equal - unordered.
Definition amd_hip_bf16.h:1319
__BF16_HOST_DEVICE_STATIC__ bool __hbeq2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Checks if two numbers are equal.
Definition amd_hip_bf16.h:1311
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hle2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Check for a <= b, returns 1.0 if greater than equal, otherwise 0.0.
Definition amd_hip_bf16.h:1446
__BF16_HOST_DEVICE_STATIC__ bool __hbneu2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Check for a != b.
Definition amd_hip_bf16.h:1399
__BF16_HOST_DEVICE_STATIC__ bool __hbgtu2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Check for a > b - unordered.
Definition amd_hip_bf16.h:1351
__BF16_HOST_DEVICE_STATIC__ bool __hbge2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Check for a >= b.
Definition amd_hip_bf16.h:1327
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __heq2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Check for a != b, returns 1.0 if equal, otherwise 0.0.
Definition amd_hip_bf16.h:1407
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hgt2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Check for a > b, returns 1.0 if greater than equal, otherwise 0.0.
Definition amd_hip_bf16.h:1427
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hisnan2(const __hip_bfloat162 a)
Check for a is NaN, returns 1.0 if NaN, otherwise 0.0.
Definition amd_hip_bf16.h:1437
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __habs2(const __hip_bfloat162 a)
Returns absolute of a bfloat162.
Definition amd_hip_bf16.h:870
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hsub2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Subtracts two bfloat162 values.
Definition amd_hip_bf16.h:913
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hmul2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Multiplies two bfloat162 values.
Definition amd_hip_bf16.h:896
__BF16_DEVICE_STATIC__ __hip_bfloat162 __hfma2(const __hip_bfloat162 a, const __hip_bfloat162 b, const __hip_bfloat162 c)
Performs FMA of given bfloat162 values.
Definition amd_hip_bf16.h:887
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hadd2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Adds two bfloat162 values.
Definition amd_hip_bf16.h:878
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hneg2(const __hip_bfloat162 a)
Converts a bfloat162 into negative.
Definition amd_hip_bf16.h:905
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __h2div(const __hip_bfloat162 a, const __hip_bfloat162 b)
Divides bfloat162 values.
Definition amd_hip_bf16.h:860
__BF16_HOST_DEVICE_STATIC__ float __bfloat162float(__hip_bfloat16 a)
Converts bfloat16 to float.
Definition amd_hip_bf16.h:505
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __float2bfloat16(float f)
Converts float to bfloat16.
Definition amd_hip_bf16.h:514
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __bfloat162bfloat162(const __hip_bfloat16 a)
Moves bfloat16 value to bfloat162.
Definition amd_hip_bf16.h:532
__BF16_HOST_DEVICE_STATIC__ unsigned short int __bfloat16_as_ushort(const __hip_bfloat16 h)
Reinterprets bits in a __hip_bfloat16 as an unsigned signed short integer.
Definition amd_hip_bf16.h:553
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __ushort_as_bfloat16(const unsigned short int a)
Reinterprets unsigned short int into a bfloat16.
Definition amd_hip_bf16.h:675
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __low2bfloat16(const __hip_bfloat162 a)
Returns low 16 bits of __hip_bfloat162.
Definition amd_hip_bf16.h:623
__BF16_HOST_DEVICE_STATIC__ float2 __bfloat1622float2(const __hip_bfloat162 a)
Converts and moves bfloat162 to float2.
Definition amd_hip_bf16.h:523
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __halves2bfloat162(const __hip_bfloat16 a, const __hip_bfloat16 b)
Combine two __hip_bfloat16 to __hip_bfloat162.
Definition amd_hip_bf16.h:583
__BF16_HOST_DEVICE_STATIC__ short int __bfloat16_as_short(const __hip_bfloat16 h)
Reinterprets bits in a __hip_bfloat16 as a signed short integer.
Definition amd_hip_bf16.h:540
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __double2bfloat16(const double a)
Convert double to __hip_bfloat16.
Definition amd_hip_bf16.h:566
__BF16_HOST_DEVICE_STATIC__ float __low2float(const __hip_bfloat162 a)
Converts low 16 bits of __hip_bfloat162 to float and returns the result.
Definition amd_hip_bf16.h:637
__BF16_HOST_DEVICE_STATIC__ float __high2float(const __hip_bfloat162 a)
Converts high 16 bits of __hip_bfloat162 to float and returns the result.
Definition amd_hip_bf16.h:606
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __low2bfloat162(const __hip_bfloat162 a)
Returns low 16 bits of __hip_bfloat162.
Definition amd_hip_bf16.h:629
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __float22bfloat162_rn(const float2 a)
Convert float2 to __hip_bfloat162.
Definition amd_hip_bf16.h:575
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __highs2bfloat162(const __hip_bfloat162 a, const __hip_bfloat162 b)
Extracts high 16 bits from each and combines them.
Definition amd_hip_bf16.h:614
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __lows2bfloat162(const __hip_bfloat162 a, const __hip_bfloat162 b)
Extracts low 16 bits from each and combines them.
Definition amd_hip_bf16.h:653
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __high2bfloat16(const __hip_bfloat162 a)
Returns high 16 bits of __hip_bfloat162.
Definition amd_hip_bf16.h:592
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __high2bfloat162(const __hip_bfloat162 a)
Returns high 16 bits of __hip_bfloat162.
Definition amd_hip_bf16.h:598
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __short_as_bfloat16(const short int a)
Reinterprets short int into a bfloat16.
Definition amd_hip_bf16.h:662
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __lowhigh2highlow(const __hip_bfloat162 a)
Swaps both halves.
Definition amd_hip_bf16.h:645
__BF16_DEVICE_STATIC__ __hip_bfloat16 hexp10(const __hip_bfloat16 h)
Calculate exponential 10 of bfloat16.
Definition amd_hip_bf16.h:1619
__BF16_DEVICE_STATIC__ __hip_bfloat16 hlog(const __hip_bfloat16 h)
Calculate natural log of bfloat16.
Definition amd_hip_bf16.h:1643
__BF16_DEVICE_STATIC__ __hip_bfloat16 hexp2(const __hip_bfloat16 h)
Calculate exponential 2 of bfloat16.
Definition amd_hip_bf16.h:1627
__BF16_DEVICE_STATIC__ __hip_bfloat16 hceil(const __hip_bfloat16 h)
Calculate ceil of bfloat16.
Definition amd_hip_bf16.h:1595
__BF16_DEVICE_STATIC__ __hip_bfloat16 hrcp(const __hip_bfloat16 h)
Calculate reciprocal.
Definition amd_hip_bf16.h:1667
__BF16_DEVICE_STATIC__ __hip_bfloat16 hsqrt(const __hip_bfloat16 h)
Calculate sqrt of bfloat16.
Definition amd_hip_bf16.h:1699
__BF16_DEVICE_STATIC__ __hip_bfloat16 hlog10(const __hip_bfloat16 h)
Calculate log 10 of bfloat16.
Definition amd_hip_bf16.h:1651
__BF16_DEVICE_STATIC__ __hip_bfloat16 hsin(const __hip_bfloat16 h)
Calculate sin of bfloat16.
Definition amd_hip_bf16.h:1691
__BF16_DEVICE_STATIC__ __hip_bfloat16 hfloor(const __hip_bfloat16 h)
Calculate floor of bfloat16.
Definition amd_hip_bf16.h:1635
__BF16_DEVICE_STATIC__ __hip_bfloat16 hrint(const __hip_bfloat16 h)
Round to nearest int.
Definition amd_hip_bf16.h:1675
__BF16_DEVICE_STATIC__ __hip_bfloat16 htrunc(const __hip_bfloat16 h)
Calculate truncate of bfloat16.
Definition amd_hip_bf16.h:1707
__BF16_DEVICE_STATIC__ __hip_bfloat16 hrsqrt(const __hip_bfloat16 h)
Reciprocal square root.
Definition amd_hip_bf16.h:1683
__BF16_DEVICE_STATIC__ __hip_bfloat16 hcos(const __hip_bfloat16 h)
Calculate cosine of bfloat16.
Definition amd_hip_bf16.h:1603
__BF16_DEVICE_STATIC__ __hip_bfloat16 hlog2(const __hip_bfloat16 h)
Calculate log 2 of bfloat16.
Definition amd_hip_bf16.h:1659
__BF16_DEVICE_STATIC__ __hip_bfloat16 hexp(const __hip_bfloat16 h)
Calculate exponential of bfloat16.
Definition amd_hip_bf16.h:1611
__BF16_DEVICE_STATIC__ __hip_bfloat162 h2trunc(const __hip_bfloat162 h)
Calculate truncate of bfloat162.
Definition amd_hip_bf16.h:1827
__BF16_DEVICE_STATIC__ __hip_bfloat162 h2rcp(const __hip_bfloat162 h)
Calculate vector reciprocal.
Definition amd_hip_bf16.h:1787
__BF16_DEVICE_STATIC__ __hip_bfloat162 h2log(const __hip_bfloat162 h)
Calculate natural log of bfloat162.
Definition amd_hip_bf16.h:1763
__BF16_DEVICE_STATIC__ __hip_bfloat162 h2exp(const __hip_bfloat162 h)
Calculate exponential of bfloat162.
Definition amd_hip_bf16.h:1731
__BF16_DEVICE_STATIC__ __hip_bfloat162 h2cos(const __hip_bfloat162 h)
Calculate cosine of bfloat162.
Definition amd_hip_bf16.h:1723
__BF16_DEVICE_STATIC__ __hip_bfloat162 h2sin(const __hip_bfloat162 h)
Calculate sin of bfloat162.
Definition amd_hip_bf16.h:1811
__BF16_DEVICE_STATIC__ __hip_bfloat162 h2log2(const __hip_bfloat162 h)
Calculate log 2 of bfloat162.
Definition amd_hip_bf16.h:1779
__BF16_DEVICE_STATIC__ __hip_bfloat162 h2ceil(const __hip_bfloat162 h)
Calculate ceil of bfloat162.
Definition amd_hip_bf16.h:1715
__BF16_DEVICE_STATIC__ __hip_bfloat162 h2floor(const __hip_bfloat162 h)
Calculate floor of bfloat162.
Definition amd_hip_bf16.h:1755
__BF16_DEVICE_STATIC__ __hip_bfloat162 h2exp10(const __hip_bfloat162 h)
Calculate exponential 10 of bfloat162.
Definition amd_hip_bf16.h:1739
__BF16_DEVICE_STATIC__ __hip_bfloat162 h2exp2(const __hip_bfloat162 h)
Calculate exponential 2 of bfloat162.
Definition amd_hip_bf16.h:1747
__BF16_DEVICE_STATIC__ __hip_bfloat162 h2log10(const __hip_bfloat162 h)
Calculate log 10 of bfloat162.
Definition amd_hip_bf16.h:1771
__BF16_DEVICE_STATIC__ __hip_bfloat162 h2rsqrt(const __hip_bfloat162 h)
Calculate vector reciprocal square root.
Definition amd_hip_bf16.h:1803
__BF16_DEVICE_STATIC__ __hip_bfloat162 h2rint(const __hip_bfloat162 h)
Calculate vector round to nearest int.
Definition amd_hip_bf16.h:1795
__BF16_DEVICE_STATIC__ __hip_bfloat162 h2sqrt(const __hip_bfloat162 h)
Calculate sqrt of bfloat162.
Definition amd_hip_bf16.h:1819
_Float16 __2f16 __attribute__((ext_vector_type(2)))
Definition hip_fp16_math_fwd.h:57
Definition amd_hip_vector_types.h:2035