HIP: Heterogenous-computing Interface for Portability
amd_hip_bf16.h
Go to the documentation of this file.
1 
96 #ifndef _HIP_INCLUDE_HIP_AMD_DETAIL_HIP_BF16_H_
97 #define _HIP_INCLUDE_HIP_AMD_DETAIL_HIP_BF16_H_
98 
99 #if !defined(__HIPCC_RTC__)
100 #include <hip/amd_detail/amd_hip_common.h>
101 #endif // !defined(__HIPCC_RTC__)
102 
103 #include "amd_hip_vector_types.h" // float2 etc
104 #include "device_library_decls.h" // ocml conversion functions
105 #include "math_fwd.h" // ocml device functions
106 
107 #define __BF16_DEVICE__ __device__
108 #if defined(__HIPCC_RTC__)
109 #define __BF16_HOST_DEVICE__ __BF16_DEVICE__
110 #else
111 #include <algorithm>
112 #include <climits>
113 #include <cmath>
114 #define __BF16_HOST_DEVICE__ __host__ __BF16_DEVICE__
115 #endif
116 #define __BF16_DEVICE_STATIC__ __BF16_DEVICE__ static inline
117 #define __BF16_HOST_DEVICE_STATIC__ __BF16_HOST_DEVICE__ static inline
118 
119 #if defined(__AVX512VL__) and defined(__AVX512BF16__) and not defined(__HIP_DEVICE_COMPILE__)
120 // Enable with -mavx512vl -mavx512bf16
121 #if defined(__MINGW64__)
122 #include <intrin.h>
123 #else
124 #include <immintrin.h>
125 #endif
126 #define HIP_BF16_AVX512_OP 1
127 static_assert(sizeof(__bf16) == sizeof(unsigned short),
128  "sizeof __bf16 should match sizeof unsigned short");
129 #else
130 #define HIP_BF16_AVX512_OP 0
131 #endif
132 
133 #define HIPRT_ONE_BF16 __float2bfloat16(1.0f)
134 #define HIPRT_ZERO_BF16 __float2bfloat16(0.0f)
135 #define HIPRT_INF_BF16 __ushort_as_bfloat16((unsigned short)0x7F80U)
136 #define HIPRT_MAX_NORMAL_BF16 __ushort_as_bfloat16((unsigned short)0x7F7FU)
137 #define HIPRT_MIN_DENORM_BF16 __ushort_as_bfloat16((unsigned short)0x0001U)
138 #define HIPRT_NAN_BF16 __ushort_as_bfloat16((unsigned short)0x7FFFU)
139 #define HIPRT_NEG_ZERO_BF16 __ushort_as_bfloat16((unsigned short)0x8000U)
140 
141 // Since we are using unsigned short to represent data in bfloat16, it can be of different sizes on
142 // different machines. These naive checks should prevent some undefined behavior on systems which
143 // have different sizes for basic types.
144 #if !defined(__HIPCC_RTC__)
145 static_assert(CHAR_BIT == 8, "byte size should be of 8 bits");
146 #endif
147 static_assert(sizeof(unsigned short) == 2, "size of unsigned short should be 2 bytes");
148 
153 typedef struct __attribute__((aligned(2))) {
154  unsigned short x;
155 } __hip_bfloat16_raw;
156 
161 typedef struct __attribute__((aligned(4))) {
162  unsigned short x;
163  unsigned short y;
164 } __hip_bfloat162_raw;
165 
172 struct __attribute__((aligned(2))) __hip_bfloat16 {
173  private:
174  __BF16_HOST_DEVICE_STATIC__ float bfloatraw_2_float(unsigned short val) {
175 #if HIP_BF16_AVX512_OP
176  union {
177  unsigned short us;
178  __bf16 bf16;
179  } u = {val};
180  return _mm_cvtsbh_ss(u.bf16);
181 #else
182  unsigned int uval = val << 16;
183  union {
184  unsigned int u32;
185  float fp32;
186  } u = {uval};
187  return u.fp32;
188 #endif
189  }
190  __BF16_HOST_DEVICE_STATIC__ unsigned short float_2_bfloatraw(float f) {
191 #if HIP_BF16_AVX512_OP
192  union {
193  __bf16 bf16;
194  unsigned short us;
195  } u = {_mm_cvtness_sbh(f)};
196  return u.us;
197 #else
198  union {
199  float fp32;
200  unsigned int u32;
201  } u = {f};
202  if (~u.u32 & 0x7f800000) {
203  // When the exponent bits are not all 1s, then the value is zero, normal,
204  // or subnormal. We round the bfloat16 mantissa up by adding 0x7FFF, plus
205  // 1 if the least significant bit of the bfloat16 mantissa is 1 (odd).
206  // This causes the bfloat16's mantissa to be incremented by 1 if the 16
207  // least significant bits of the float mantissa are greater than 0x8000,
208  // or if they are equal to 0x8000 and the least significant bit of the
209  // bfloat16 mantissa is 1 (odd). This causes it to be rounded to even when
210  // the lower 16 bits are exactly 0x8000. If the bfloat16 mantissa already
211  // has the value 0x7f, then incrementing it causes it to become 0x00 and
212  // the exponent is incremented by one, which is the next higher FP value
213  // to the unrounded bfloat16 value. When the bfloat16 value is subnormal
214  // with an exponent of 0x00 and a mantissa of 0x7F, it may be rounded up
215  // to a normal value with an exponent of 0x01 and a mantissa of 0x00.
216  // When the bfloat16 value has an exponent of 0xFE and a mantissa of 0x7F,
217  // incrementing it causes it to become an exponent of 0xFF and a mantissa
218  // of 0x00, which is Inf, the next higher value to the unrounded value.
219  u.u32 += 0x7fff + ((u.u32 >> 16) & 1); // Round to nearest, round to even
220  } else if (u.u32 & 0xffff) {
221  // When all of the exponent bits are 1, the value is Inf or NaN.
222  // Inf is indicated by a zero mantissa. NaN is indicated by any nonzero
223  // mantissa bit. Quiet NaN is indicated by the most significant mantissa
224  // bit being 1. Signaling NaN is indicated by the most significant
225  // mantissa bit being 0 but some other bit(s) being 1. If any of the
226  // lower 16 bits of the mantissa are 1, we set the least significant bit
227  // of the bfloat16 mantissa, in order to preserve signaling NaN in case
228  // the bloat16's mantissa bits are all 0.
229  u.u32 |= 0x10000; // Preserve signaling NaN
230  }
231  return static_cast<unsigned short>(u.u32 >> 16);
232 #endif
233  }
234 
235  __BF16_HOST_DEVICE_STATIC__ unsigned short double_2_bfloatraw(double d_in) {
236  union {
237  float fp32;
238  unsigned int u32;
239  } u = {static_cast<float>(d_in)};
240  double d = u.fp32;
241 
242  // Round to odd
243  if ((d_in > 0.0 && d > d_in) || (d_in < 0.0 && d < d_in)) {
244  u.u32--;
245  u.u32 |= 1;
246  }
247 
248  return float_2_bfloatraw(u.fp32);
249  }
250 
251  protected:
253  unsigned short __x;
254 
255  public:
256  // TODO: SWDEV-452411
257  // Need to add constructor of __hip_bfloat16 from
258  // unsigned long long
259  // long long
260  // long
261  // unsigned long
262  // Casting directly to double might lead to double rounding.
263 
265  __BF16_HOST_DEVICE__ __hip_bfloat16(unsigned int val)
266  : __x(double_2_bfloatraw(static_cast<double>(val))) {}
267 
269  __BF16_HOST_DEVICE__ __hip_bfloat16(int val)
270  : __x(double_2_bfloatraw(static_cast<double>(val))) {}
271 
273  __BF16_HOST_DEVICE__ __hip_bfloat16(unsigned short val)
274  : __x(float_2_bfloatraw(static_cast<float>(val))) {}
275 
277  __BF16_HOST_DEVICE__ __hip_bfloat16(short val)
278  : __x(float_2_bfloatraw(static_cast<float>(val))) {}
279 
281  __BF16_HOST_DEVICE__ __hip_bfloat16(const double val) : __x(double_2_bfloatraw(val)) {}
282 
284  __BF16_HOST_DEVICE__ __hip_bfloat16(const float val) : __x(float_2_bfloatraw(val)) {}
285 
287  __BF16_HOST_DEVICE__ __hip_bfloat16(const __hip_bfloat16_raw& val) : __x(val.x) {}
288 
290  __BF16_HOST_DEVICE__ __hip_bfloat16() = default;
291 
293  __BF16_HOST_DEVICE__ operator __hip_bfloat16_raw() const { return __hip_bfloat16_raw{__x}; }
294 
296  __BF16_HOST_DEVICE__ operator __hip_bfloat16_raw() const volatile {
297  return __hip_bfloat16_raw{__x};
298  }
299 
301  __BF16_HOST_DEVICE__ operator bool() const {
302  auto val = bfloatraw_2_float(__x);
303  return val != 0.0f && val != -0.0f;
304  }
305 
307  __BF16_HOST_DEVICE__ operator char() const { return static_cast<char>(bfloatraw_2_float(__x)); }
308 
310  __BF16_HOST_DEVICE__ operator float() const { return bfloatraw_2_float(__x); }
311 
313  __BF16_HOST_DEVICE__ operator int() const { return static_cast<int>(bfloatraw_2_float(__x)); }
314 
316  __BF16_HOST_DEVICE__ operator long() const { return static_cast<long>(bfloatraw_2_float(__x)); }
317 
319  __BF16_HOST_DEVICE__ operator long long() const {
320  return static_cast<long long>(bfloatraw_2_float(__x));
321  }
322 
324  __BF16_HOST_DEVICE__ operator short() const { return static_cast<short>(bfloatraw_2_float(__x)); }
325 
327  __BF16_HOST_DEVICE__ operator signed char() const {
328  return static_cast<signed char>(bfloatraw_2_float(__x));
329  }
330 
332  __BF16_HOST_DEVICE__ operator unsigned char() const {
333  return static_cast<unsigned char>(bfloatraw_2_float(__x));
334  }
335 
337  __BF16_HOST_DEVICE__ operator unsigned int() const {
338  return static_cast<unsigned int>(bfloatraw_2_float(__x));
339  }
340 
342  __BF16_HOST_DEVICE__ operator unsigned long() const {
343  return static_cast<unsigned long>(bfloatraw_2_float(__x));
344  }
345 
347  __BF16_HOST_DEVICE__ operator unsigned long long() const {
348  return static_cast<unsigned long long>(bfloatraw_2_float(__x));
349  }
350 
352  __BF16_HOST_DEVICE__ operator unsigned short() const {
353  return static_cast<unsigned short>(bfloatraw_2_float(__x));
354  }
355 
356  // TODO: SWDEV-452411 add operator which converts unsigned long long and long long to bfloat
357 
359  __BF16_HOST_DEVICE__ __hip_bfloat16& operator=(unsigned int val) {
360  __x = float_2_bfloatraw(static_cast<float>(val));
361  return *this;
362  }
363 
365  __BF16_HOST_DEVICE__ __hip_bfloat16& operator=(int val) {
366  __x = float_2_bfloatraw(static_cast<float>(val));
367  return *this;
368  }
369 
371  __BF16_HOST_DEVICE__ __hip_bfloat16& operator=(unsigned short val) {
372  __x = float_2_bfloatraw(static_cast<float>(val));
373  return *this;
374  }
375 
377  __BF16_HOST_DEVICE__ __hip_bfloat16& operator=(short val) {
378  __x = float_2_bfloatraw(static_cast<float>(val));
379  return *this;
380  }
381 
383  __BF16_HOST_DEVICE__ __hip_bfloat16& operator=(const double f) {
384  __x = float_2_bfloatraw(static_cast<float>(f));
385  return *this;
386  }
387 
389  __BF16_HOST_DEVICE__ __hip_bfloat16& operator=(const float f) {
390  __x = float_2_bfloatraw(f);
391  return *this;
392  }
393 
395  __BF16_HOST_DEVICE__ __hip_bfloat16& operator=(const __hip_bfloat16_raw& hr) {
396  __x = hr.x;
397  return *this;
398  }
399 
401  __BF16_HOST_DEVICE__ volatile __hip_bfloat16& operator=(const __hip_bfloat16_raw& hr) volatile {
402  __x = hr.x;
403  return *this;
404  }
405 
407  __BF16_HOST_DEVICE__ volatile __hip_bfloat16& operator=(
408  const volatile __hip_bfloat16_raw& hr) volatile {
409  __x = hr.x;
410  return *this;
411  }
412 };
421 struct __attribute__((aligned(4))) __hip_bfloat162 {
422  public:
423  __hip_bfloat16 x;
424  __hip_bfloat16 y;
427  public:
429  __BF16_HOST_DEVICE__ __hip_bfloat162(const __hip_bfloat162_raw& h2r)
430  : x(__hip_bfloat16(__hip_bfloat16_raw{h2r.x})),
431  y(__hip_bfloat16(__hip_bfloat16_raw{h2r.y})) {}
432 
434  __BF16_HOST_DEVICE__ __hip_bfloat162(const __hip_bfloat162& val) {
435  __hip_bfloat162_raw hr = val;
436  x = __hip_bfloat16_raw{hr.x};
437  y = __hip_bfloat16_raw{hr.y};
438  }
439 
441  __BF16_HOST_DEVICE__ __hip_bfloat162(const __hip_bfloat16& a, const __hip_bfloat16& b)
442  : x(a), y(b) {}
443 
445  __BF16_HOST_DEVICE__ __hip_bfloat162() = default;
446 
448  __BF16_HOST_DEVICE__ operator __hip_bfloat162_raw() const {
449  __hip_bfloat16_raw l = x;
450  __hip_bfloat16_raw r = y;
451  return __hip_bfloat162_raw{l.x, r.x};
452  }
453 
455  __BF16_HOST_DEVICE__ operator float2() const {
456 #if HIP_BF16_AVX512_OP
457  union {
458  __hip_bfloat162_raw raw2;
459  __bf16 bf162[2];
460  static_assert(sizeof(__bf16[2]) == sizeof(__hip_bfloat162_raw));
461  } u;
462  u.raw2 = *this;
463  __m128bh pbf16{u.bf162[0], u.bf162[1], 0, 0};
464  __m128 pf32 = _mm_cvtpbh_ps(pbf16);
465  float2 ret(pf32[0], pf32[1]);
466 #else
467  float2 ret(x, y);
468 #endif
469  return ret;
470  }
471 
473  __BF16_HOST_DEVICE__ __hip_bfloat162& operator=(const __hip_bfloat162_raw& h2r) {
474  x = __hip_bfloat16(__hip_bfloat16_raw{h2r.x});
475  y = __hip_bfloat16(__hip_bfloat16_raw{h2r.y});
476  return *this;
477  }
478 
480  __BF16_HOST_DEVICE__ __hip_bfloat162& operator=(const __hip_bfloat162& src) {
481  __hip_bfloat162_raw hr = src;
482  x = __hip_bfloat16(__hip_bfloat16_raw{hr.x});
483  y = __hip_bfloat16(__hip_bfloat16_raw{hr.y});
484  return *this;
485  }
486 };
493 __BF16_HOST_DEVICE_STATIC__ float __bfloat162float(__hip_bfloat16 a) {
494  float ret = a;
495  return ret;
496 }
497 
502 __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __float2bfloat16(float f) {
503  __hip_bfloat16 ret{f};
504  return ret;
505 }
506 
511 __BF16_HOST_DEVICE_STATIC__ float2 __bfloat1622float2(const __hip_bfloat162 a) {
512  float2 ret = a;
513  return ret;
514 }
515 
520 __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __bfloat162bfloat162(const __hip_bfloat16 a) {
521  return __hip_bfloat162(a, a);
522 }
523 
528 __BF16_HOST_DEVICE_STATIC__ short int __bfloat16_as_short(const __hip_bfloat16 h) {
529  short ret = h;
530  return ret;
531 }
532 
537 __BF16_HOST_DEVICE_STATIC__ unsigned short int __bfloat16_as_ushort(const __hip_bfloat16 h) {
538  unsigned short ret = h;
539  return ret;
540 }
541 
546 __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __double2bfloat16(const double a) {
547  __hip_bfloat16 ret{a};
548  return ret;
549 }
550 
555 __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __float22bfloat162_rn(const float2 a) {
556  return __hip_bfloat162{__float2bfloat16(a.x), __float2bfloat16(a.y)};
557 }
558 
563 __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __halves2bfloat162(const __hip_bfloat16 a,
564  const __hip_bfloat16 b) {
565  return __hip_bfloat162(a, b);
566 }
567 
572 __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __high2bfloat16(const __hip_bfloat162 a) {
573  __hip_bfloat162_raw hr = a;
574  return __hip_bfloat16(__hip_bfloat16_raw{hr.y});
575 }
576 
581 __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __high2bfloat162(const __hip_bfloat162 a) {
582  __hip_bfloat162_raw hr = a;
583  return __hip_bfloat162(__hip_bfloat16_raw{hr.y}, __hip_bfloat16_raw{hr.y});
584 }
585 
590 __BF16_HOST_DEVICE_STATIC__ float __high2float(const __hip_bfloat162 a) {
591  __hip_bfloat162_raw hr = a;
592  return __bfloat162float(__hip_bfloat16(__hip_bfloat16_raw{hr.y}));
593 }
594 
599 __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __highs2bfloat162(const __hip_bfloat162 a,
600  const __hip_bfloat162 b) {
601  __hip_bfloat162_raw hr_a = a;
602  __hip_bfloat162_raw hr_b = b;
603  return __hip_bfloat162(__hip_bfloat162_raw{hr_a.y, hr_b.y});
604 }
605 
610 __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __low2bfloat16(const __hip_bfloat162 a) {
611  __hip_bfloat162_raw hr = a;
612  return __hip_bfloat16(hr.x);
613 }
614 
619 __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __low2bfloat162(const __hip_bfloat162 a) {
620  __hip_bfloat162_raw hr = a;
621  return __hip_bfloat162(hr.x, hr.x);
622 }
623 
628 __BF16_HOST_DEVICE_STATIC__ float __low2float(const __hip_bfloat162 a) {
629  __hip_bfloat162_raw hr = a;
630  return __bfloat162float(__hip_bfloat16(__hip_bfloat16_raw{hr.x}));
631 }
632 
637 __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __lowhigh2highlow(const __hip_bfloat162 a) {
638  __hip_bfloat162_raw hr = a;
639  return __hip_bfloat162(__hip_bfloat162_raw{hr.y, hr.x});
640 }
641 
646 __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __lows2bfloat162(const __hip_bfloat162 a,
647  const __hip_bfloat162 b) {
648  __hip_bfloat162_raw hr_a = a;
649  __hip_bfloat162_raw hr_b = b;
650  return __hip_bfloat162(__hip_bfloat162_raw{hr_a.x, hr_b.x});
651 }
652 
657 __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __short_as_bfloat16(const short int a) {
658  return __hip_bfloat16(a);
659 }
660 
665 __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __ushort_as_bfloat16(const unsigned short int a) {
666  return __hip_bfloat16(a);
667 }
668 
673 __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hadd(const __hip_bfloat16 a, const __hip_bfloat16 b) {
675 }
676 
681 __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hsub(const __hip_bfloat16 a, const __hip_bfloat16 b) {
683 }
684 
689 __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hdiv(const __hip_bfloat16 a, const __hip_bfloat16 b) {
691 }
692 
697 __BF16_DEVICE_STATIC__ __hip_bfloat16 __hfma(const __hip_bfloat16 a, const __hip_bfloat16 b,
698  const __hip_bfloat16 c) {
699  return __float2bfloat16(
700  __ocml_fma_f32(__bfloat162float(a), __bfloat162float(b), __bfloat162float(c)));
701 }
702 
707 __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hmul(const __hip_bfloat16 a, const __hip_bfloat16 b) {
709 }
710 
715 __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hneg(const __hip_bfloat16 a) {
716  __hip_bfloat16_raw hr = a;
717  hr.x ^= 0x8000;
718  return __hip_bfloat16(hr);
719 }
720 
725 __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __habs(const __hip_bfloat16 a) {
726  __hip_bfloat16_raw hr = a;
727  hr.x &= 0x7FFF;
728  return __hip_bfloat16(hr);
729 }
730 
735 __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __h2div(const __hip_bfloat162 a,
736  const __hip_bfloat162 b) {
737  __hip_bfloat162_raw hr_a = a;
738  __hip_bfloat162_raw hr_b = b;
739  return __hip_bfloat162(__float2bfloat16(__bfloat162float(__hip_bfloat16_raw{hr_a.x}) /
740  __bfloat162float(__hip_bfloat16_raw{hr_b.x})),
741  __float2bfloat16(__bfloat162float(__hip_bfloat16_raw{hr_a.y}) /
742  __bfloat162float(__hip_bfloat16_raw{hr_b.y})));
743 }
744 
749 __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __habs2(const __hip_bfloat162 a) {
750  __hip_bfloat162_raw hr_a = a;
751  return __hip_bfloat162(__habs(__hip_bfloat16_raw{hr_a.x}), __habs(__hip_bfloat16_raw{hr_a.y}));
752 }
753 
758 __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hadd2(const __hip_bfloat162 a,
759  const __hip_bfloat162 b) {
760  __hip_bfloat162_raw hr_a = a;
761  __hip_bfloat162_raw hr_b = b;
762  return __hip_bfloat162(__hadd(__hip_bfloat16_raw{hr_a.x}, __hip_bfloat16_raw{hr_b.x}),
763  __hadd(__hip_bfloat16_raw{hr_a.y}, __hip_bfloat16_raw{hr_b.y}));
764 }
765 
770 __BF16_DEVICE_STATIC__ __hip_bfloat162 __hfma2(const __hip_bfloat162 a, const __hip_bfloat162 b,
771  const __hip_bfloat162 c) {
772  __hip_bfloat162_raw hr_a = a;
773  __hip_bfloat162_raw hr_b = b;
774  __hip_bfloat162_raw hr_c = c;
775  return __hip_bfloat162(
776  __hfma(__hip_bfloat16_raw{hr_a.x}, __hip_bfloat16_raw{hr_b.x}, __hip_bfloat16_raw{hr_c.x}),
777  __hfma(__hip_bfloat16_raw{hr_a.y}, __hip_bfloat16_raw{hr_b.y}, __hip_bfloat16_raw{hr_c.y}));
778 }
779 
784 __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hmul2(const __hip_bfloat162 a,
785  const __hip_bfloat162 b) {
786  __hip_bfloat162_raw hr_a = a;
787  __hip_bfloat162_raw hr_b = b;
788  return __hip_bfloat162(__hmul(__hip_bfloat16_raw{hr_a.x}, __hip_bfloat16_raw{hr_b.x}),
789  __hmul(__hip_bfloat16_raw{hr_a.y}, __hip_bfloat16_raw{hr_b.y}));
790 }
791 
796 __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hneg2(const __hip_bfloat162 a) {
797  __hip_bfloat162_raw hr_a = a;
798  return __hip_bfloat162(__hneg(__hip_bfloat16_raw{hr_a.x}), __hneg(__hip_bfloat16_raw{hr_a.y}));
799 }
800 
805 __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hsub2(const __hip_bfloat162 a,
806  const __hip_bfloat162 b) {
807  __hip_bfloat162_raw hr_a = a;
808  __hip_bfloat162_raw hr_b = b;
809  return __hip_bfloat162(__hsub(__hip_bfloat16_raw{hr_a.x}, __hip_bfloat16_raw{hr_b.x}),
810  __hsub(__hip_bfloat16_raw{hr_a.y}, __hip_bfloat16_raw{hr_b.y}));
811 }
812 
817 __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 operator*(const __hip_bfloat16& l,
818  const __hip_bfloat16& r) {
819  return __hmul(l, r);
820 }
821 
826 __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16& operator*=(__hip_bfloat16& l, const __hip_bfloat16& r) {
827  l = __hmul(l, r);
828  return l;
829 }
830 
835 __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 operator+(const __hip_bfloat16& l) { return l; }
836 
841 __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 operator+(const __hip_bfloat16& l,
842  const __hip_bfloat16& r) {
843  return __hadd(l, r);
844 }
845 
850 __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 operator-(const __hip_bfloat16& l) { return __hneg(l); }
851 
856 __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 operator-(const __hip_bfloat16& l,
857  const __hip_bfloat16& r) {
858  return __hsub(l, r);
859 }
860 
865 __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 operator++(__hip_bfloat16& l, const int) {
866  auto ret = l;
867  l = __hadd(l, HIPRT_ONE_BF16);
868  return ret;
869 }
870 
875 __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16& operator++(__hip_bfloat16& l) {
876  l = __hadd(l, HIPRT_ONE_BF16);
877  return l;
878 }
879 
884 __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 operator--(__hip_bfloat16& l, const int) {
885  auto ret = l;
886  l = __hsub(l, HIPRT_ONE_BF16);
887  return ret;
888 }
889 
894 __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16& operator--(__hip_bfloat16& l) {
895  l = __hsub(l, HIPRT_ONE_BF16);
896  return l;
897 }
898 
903 __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16& operator+=(__hip_bfloat16& l, const __hip_bfloat16& r) {
904  l = __hadd(l, r);
905  return l;
906 }
907 
912 __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16& operator-=(__hip_bfloat16& l, const __hip_bfloat16& r) {
913  l = __hsub(l, r);
914  return l;
915 }
916 
921 __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 operator/(const __hip_bfloat16& l,
922  const __hip_bfloat16& r) {
923  return __hdiv(l, r);
924 }
925 
930 __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16& operator/=(__hip_bfloat16& l, const __hip_bfloat16& r) {
931  l = __hdiv(l, r);
932  return l;
933 }
934 
939 __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 operator*(const __hip_bfloat162& l,
940  const __hip_bfloat162& r) {
941  return __hmul2(l, r);
942 }
943 
948 __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162& operator*=(__hip_bfloat162& l,
949  const __hip_bfloat162& r) {
950  l = __hmul2(l, r);
951  return l;
952 }
953 
958 __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 operator+(const __hip_bfloat162& l) { return l; }
959 
964 __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 operator+(const __hip_bfloat162& l,
965  const __hip_bfloat162& r) {
966  return __hadd2(l, r);
967 }
968 
973 __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 operator-(const __hip_bfloat162& l) {
974  return __hneg2(l);
975 }
976 
981 __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 operator-(const __hip_bfloat162& l,
982  const __hip_bfloat162& r) {
983  return __hsub2(l, r);
984 }
985 
990 __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 operator++(__hip_bfloat162& l, const int) {
991  auto ret = l;
992  l = __hadd2(l, {HIPRT_ONE_BF16, HIPRT_ONE_BF16});
993  return ret;
994 }
995 
1000 __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162& operator++(__hip_bfloat162& l) {
1001  l = __hadd2(l, {HIPRT_ONE_BF16, HIPRT_ONE_BF16});
1002  return l;
1003 }
1004 
1009 __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 operator--(__hip_bfloat162& l, const int) {
1010  auto ret = l;
1011  l = __hsub2(l, {HIPRT_ONE_BF16, HIPRT_ONE_BF16});
1012  return ret;
1013 }
1014 
1019 __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162& operator--(__hip_bfloat162& l) {
1020  l = __hsub2(l, {HIPRT_ONE_BF16, HIPRT_ONE_BF16});
1021  return l;
1022 }
1023 
1028 __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162& operator+=(__hip_bfloat162& l,
1029  const __hip_bfloat162& r) {
1030  l = __hadd2(l, r);
1031  return l;
1032 }
1033 
1038 __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162& operator-=(__hip_bfloat162& l,
1039  const __hip_bfloat162& r) {
1040  l = __hsub2(l, r);
1041  return l;
1042 }
1043 
1048 __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 operator/(const __hip_bfloat162& l,
1049  const __hip_bfloat162& r) {
1050  return __h2div(l, r);
1051 }
1052 
1057 __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162& operator/=(__hip_bfloat162& l,
1058  const __hip_bfloat162& r) {
1059  l = __h2div(l, r);
1060  return l;
1061 }
1062 
1067 __BF16_HOST_DEVICE_STATIC__ bool __heq(const __hip_bfloat16 a, const __hip_bfloat16 b) {
1068  return __bfloat162float(a) == __bfloat162float(b);
1069 }
1070 
1075 __BF16_HOST_DEVICE_STATIC__ bool __hequ(const __hip_bfloat16 a, const __hip_bfloat16 b) {
1076  return !(__bfloat162float(a) < __bfloat162float(b)) &&
1078 }
1079 
1084 __BF16_HOST_DEVICE_STATIC__ bool __hgt(const __hip_bfloat16 a, const __hip_bfloat16 b) {
1085  return __bfloat162float(a) > __bfloat162float(b);
1086 }
1087 
1092 __BF16_HOST_DEVICE_STATIC__ bool __hgtu(const __hip_bfloat16 a, const __hip_bfloat16 b) {
1093  return !(__bfloat162float(a) <= __bfloat162float(b));
1094 }
1095 
1100 __BF16_HOST_DEVICE_STATIC__ bool __hge(const __hip_bfloat16 a, const __hip_bfloat16 b) {
1101  return __bfloat162float(a) >= __bfloat162float(b);
1102 }
1103 
1108 __BF16_HOST_DEVICE_STATIC__ bool __hgeu(const __hip_bfloat16 a, const __hip_bfloat16 b) {
1109  return !(__bfloat162float(a) < __bfloat162float(b));
1110 }
1111 
1116 __BF16_HOST_DEVICE_STATIC__ bool __hne(const __hip_bfloat16 a, const __hip_bfloat16 b) {
1117  return __bfloat162float(a) != __bfloat162float(b);
1118 }
1119 
1124 __BF16_HOST_DEVICE_STATIC__ bool __hneu(const __hip_bfloat16 a, const __hip_bfloat16 b) {
1125  return !(__bfloat162float(a) == __bfloat162float(b));
1126 }
1127 
1132 __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hmax(const __hip_bfloat16 a, const __hip_bfloat16 b) {
1133 #if __HIP_DEVICE_COMPILE__
1134  return __float2bfloat16(__ocml_fmax_f32(__bfloat162float(a), __bfloat162float(b)));
1135 #else
1136  return __float2bfloat16(std::max(__bfloat162float(a), __bfloat162float(b)));
1137 #endif
1138 }
1139 
1144 __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hmin(const __hip_bfloat16 a, const __hip_bfloat16 b) {
1145 #if __HIP_DEVICE_COMPILE__
1146  return __float2bfloat16(__ocml_fmin_f32(__bfloat162float(a), __bfloat162float(b)));
1147 #else
1148  return __float2bfloat16(std::min(__bfloat162float(a), __bfloat162float(b)));
1149 #endif
1150 }
1151 
1156 __BF16_HOST_DEVICE_STATIC__ bool __hlt(const __hip_bfloat16 a, const __hip_bfloat16 b) {
1157  return __bfloat162float(a) < __bfloat162float(b);
1158 }
1159 
1164 __BF16_HOST_DEVICE_STATIC__ bool __hltu(const __hip_bfloat16 a, const __hip_bfloat16 b) {
1165  return !(__bfloat162float(a) >= __bfloat162float(b));
1166 }
1167 
1172 __BF16_HOST_DEVICE_STATIC__ bool __hle(const __hip_bfloat16 a, const __hip_bfloat16 b) {
1173  return __bfloat162float(a) <= __bfloat162float(b);
1174 }
1175 
1180 __BF16_HOST_DEVICE_STATIC__ bool __hleu(const __hip_bfloat16 a, const __hip_bfloat16 b) {
1181  return !(__bfloat162float(a) > __bfloat162float(b));
1182 }
1183 
1188 __BF16_HOST_DEVICE_STATIC__ int __hisinf(const __hip_bfloat16 a) {
1189  __hip_bfloat16_raw hr = a;
1190  return !(~hr.x & 0x7f80) && !(hr.x & 0x7f);
1191 }
1192 
1197 __BF16_HOST_DEVICE_STATIC__ bool __hisnan(const __hip_bfloat16 a) {
1198  __hip_bfloat16_raw hr = a;
1199  return !(~hr.x & 0x7f80) && +(hr.x & 0x7f);
1200 }
1201 
1206 __BF16_HOST_DEVICE_STATIC__ bool __hbeq2(const __hip_bfloat162 a, const __hip_bfloat162 b) {
1207  __hip_bfloat162_raw hr_a = a;
1208  __hip_bfloat162_raw hr_b = b;
1209  return __heq(__hip_bfloat16_raw{hr_a.x}, __hip_bfloat16_raw{hr_b.x}) &&
1210  __heq(__hip_bfloat16_raw{hr_a.y}, __hip_bfloat16_raw{hr_b.y});
1211 }
1212 
1217 __BF16_HOST_DEVICE_STATIC__ bool __hbequ2(const __hip_bfloat162 a, const __hip_bfloat162 b) {
1218  __hip_bfloat162_raw hr_a = a;
1219  __hip_bfloat162_raw hr_b = b;
1220  return __hequ(__hip_bfloat16_raw{hr_a.x}, __hip_bfloat16_raw{hr_b.x}) &&
1221  __hequ(__hip_bfloat16_raw{hr_a.y}, __hip_bfloat16_raw{hr_b.y});
1222 }
1223 
1228 __BF16_HOST_DEVICE_STATIC__ bool __hbge2(const __hip_bfloat162 a, const __hip_bfloat162 b) {
1229  __hip_bfloat162_raw hr_a = a;
1230  __hip_bfloat162_raw hr_b = b;
1231  return __hge(__hip_bfloat16_raw{hr_a.x}, __hip_bfloat16_raw{hr_b.x}) &&
1232  __hge(__hip_bfloat16_raw{hr_a.y}, __hip_bfloat16_raw{hr_b.y});
1233 }
1234 
1239 __BF16_HOST_DEVICE_STATIC__ bool __hbgeu2(const __hip_bfloat162 a, const __hip_bfloat162 b) {
1240  __hip_bfloat162_raw hr_a = a;
1241  __hip_bfloat162_raw hr_b = b;
1242  return __hgeu(__hip_bfloat16_raw{hr_a.x}, __hip_bfloat16_raw{hr_b.x}) &&
1243  __hgeu(__hip_bfloat16_raw{hr_a.y}, __hip_bfloat16_raw{hr_b.y});
1244 }
1245 
1250 __BF16_HOST_DEVICE_STATIC__ bool __hbgt2(const __hip_bfloat162 a, const __hip_bfloat162 b) {
1251  __hip_bfloat162_raw hr_a = a;
1252  __hip_bfloat162_raw hr_b = b;
1253  return __hgt(__hip_bfloat16_raw{hr_a.x}, __hip_bfloat16_raw{hr_b.x}) &&
1254  __hgt(__hip_bfloat16_raw{hr_a.y}, __hip_bfloat16_raw{hr_b.y});
1255 }
1256 
1261 __BF16_HOST_DEVICE_STATIC__ bool __hbgtu2(const __hip_bfloat162 a, const __hip_bfloat162 b) {
1262  __hip_bfloat162_raw hr_a = a;
1263  __hip_bfloat162_raw hr_b = b;
1264  return __hgtu(__hip_bfloat16_raw{hr_a.x}, __hip_bfloat16_raw{hr_b.x}) &&
1265  __hgtu(__hip_bfloat16_raw{hr_a.y}, __hip_bfloat16_raw{hr_b.y});
1266 }
1267 
1272 __BF16_HOST_DEVICE_STATIC__ bool __hble2(const __hip_bfloat162 a, const __hip_bfloat162 b) {
1273  __hip_bfloat162_raw hr_a = a;
1274  __hip_bfloat162_raw hr_b = b;
1275  return __hle(__hip_bfloat16_raw{hr_a.x}, __hip_bfloat16_raw{hr_b.x}) &&
1276  __hle(__hip_bfloat16_raw{hr_a.y}, __hip_bfloat16_raw{hr_b.y});
1277 }
1278 
1283 __BF16_HOST_DEVICE_STATIC__ bool __hbleu2(const __hip_bfloat162 a, const __hip_bfloat162 b) {
1284  __hip_bfloat162_raw hr_a = a;
1285  __hip_bfloat162_raw hr_b = b;
1286  return __hleu(__hip_bfloat16_raw{hr_a.x}, __hip_bfloat16_raw{hr_b.x}) &&
1287  __hleu(__hip_bfloat16_raw{hr_a.y}, __hip_bfloat16_raw{hr_b.y});
1288 }
1289 
1294 __BF16_HOST_DEVICE_STATIC__ bool __hblt2(const __hip_bfloat162 a, const __hip_bfloat162 b) {
1295  __hip_bfloat162_raw hr_a = a;
1296  __hip_bfloat162_raw hr_b = b;
1297  return __hlt(__hip_bfloat16_raw{hr_a.x}, __hip_bfloat16_raw{hr_b.x}) &&
1298  __hlt(__hip_bfloat16_raw{hr_a.y}, __hip_bfloat16_raw{hr_b.y});
1299 }
1300 
1305 __BF16_HOST_DEVICE_STATIC__ bool __hbltu2(const __hip_bfloat162 a, const __hip_bfloat162 b) {
1306  __hip_bfloat162_raw hr_a = a;
1307  __hip_bfloat162_raw hr_b = b;
1308  return __hltu(__hip_bfloat16_raw{hr_a.x}, __hip_bfloat16_raw{hr_b.x}) &&
1309  __hltu(__hip_bfloat16_raw{hr_a.y}, __hip_bfloat16_raw{hr_b.y});
1310 }
1311 
1316 __BF16_HOST_DEVICE_STATIC__ bool __hbne2(const __hip_bfloat162 a, const __hip_bfloat162 b) {
1317  __hip_bfloat162_raw hr_a = a;
1318  __hip_bfloat162_raw hr_b = b;
1319  return __hne(__hip_bfloat16(__hip_bfloat16_raw{hr_a.x}),
1320  __hip_bfloat16(__hip_bfloat16_raw{hr_b.x})) &&
1321  __hne(__hip_bfloat16(__hip_bfloat16_raw{hr_a.y}), __hip_bfloat16(__hip_bfloat16_raw{hr_b.y}));
1322 }
1323 
1328 __BF16_HOST_DEVICE_STATIC__ bool __hbneu2(const __hip_bfloat162 a, const __hip_bfloat162 b) {
1329  __hip_bfloat162_raw hr_a = a;
1330  __hip_bfloat162_raw hr_b = b;
1331  return __hneu(__hip_bfloat16_raw{hr_a.x}, __hip_bfloat16_raw{hr_b.x}) ||
1332  __hneu(__hip_bfloat16_raw{hr_a.y}, __hip_bfloat16_raw{hr_b.y});
1333 }
1334 
1339 __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __heq2(const __hip_bfloat162 a,
1340  const __hip_bfloat162 b) {
1341  __hip_bfloat162_raw hr_a = a;
1342  __hip_bfloat162_raw hr_b = b;
1343  return __hip_bfloat162{
1344  {__heq(__hip_bfloat16_raw{hr_a.x}, __hip_bfloat16_raw{hr_b.x}) ? HIPRT_ONE_BF16
1345  : HIPRT_ZERO_BF16},
1346  {__heq(__hip_bfloat16_raw{hr_a.y}, __hip_bfloat16_raw{hr_b.y}) ? HIPRT_ONE_BF16
1347  : HIPRT_ZERO_BF16}};
1348 }
1349 
1354 __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hge2(const __hip_bfloat162 a,
1355  const __hip_bfloat162 b) {
1356  __hip_bfloat162_raw hr_a = a;
1357  __hip_bfloat162_raw hr_b = b;
1358  return __hip_bfloat162{
1359  {__hge(__hip_bfloat16_raw{hr_a.x}, __hip_bfloat16_raw{hr_b.x}) ? HIPRT_ONE_BF16
1360  : HIPRT_ZERO_BF16},
1361  {__hge(__hip_bfloat16_raw{hr_a.y}, __hip_bfloat16_raw{hr_b.y}) ? HIPRT_ONE_BF16
1362  : HIPRT_ZERO_BF16}};
1363 }
1364 
1369 __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hgt2(const __hip_bfloat162 a,
1370  const __hip_bfloat162 b) {
1371  __hip_bfloat162_raw hr_a = a;
1372  __hip_bfloat162_raw hr_b = b;
1373  return __hip_bfloat162{
1374  {__hgt(__hip_bfloat16_raw{hr_a.x}, __hip_bfloat16_raw{hr_b.x}) ? HIPRT_ONE_BF16
1375  : HIPRT_ZERO_BF16},
1376  {__hgt(__hip_bfloat16_raw{hr_a.y}, __hip_bfloat16_raw{hr_b.y}) ? HIPRT_ONE_BF16
1377  : HIPRT_ONE_BF16}};
1378 }
1379 
1384 __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hisnan2(const __hip_bfloat162 a) {
1385  __hip_bfloat162_raw hr_a = a;
1386  return __hip_bfloat162{{__hisnan(__hip_bfloat16_raw{hr_a.x}) ? HIPRT_ONE_BF16 : HIPRT_ZERO_BF16},
1387  {__hisnan(__hip_bfloat16_raw{hr_a.y}) ? HIPRT_ONE_BF16 : HIPRT_ONE_BF16}};
1388 }
1389 
1394 __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hle2(const __hip_bfloat162 a,
1395  const __hip_bfloat162 b) {
1396  __hip_bfloat162_raw hr_a = a;
1397  __hip_bfloat162_raw hr_b = b;
1398  return __hip_bfloat162{
1399  {__hle(__hip_bfloat16_raw{hr_a.x}, __hip_bfloat16_raw{hr_b.x}) ? HIPRT_ONE_BF16
1400  : HIPRT_ZERO_BF16},
1401  {__hle(__hip_bfloat16_raw{hr_a.y}, __hip_bfloat16_raw{hr_b.y}) ? HIPRT_ONE_BF16
1402  : HIPRT_ZERO_BF16}};
1403 }
1404 
1409 __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hlt2(const __hip_bfloat162 a,
1410  const __hip_bfloat162 b) {
1411  __hip_bfloat162_raw hr_a = a;
1412  __hip_bfloat162_raw hr_b = b;
1413  return __hip_bfloat162{
1414  {__hlt(__hip_bfloat16_raw{hr_a.x}, __hip_bfloat16_raw{hr_b.x}) ? HIPRT_ONE_BF16
1415  : HIPRT_ZERO_BF16},
1416  {__hlt(__hip_bfloat16_raw{hr_a.y}, __hip_bfloat16_raw{hr_b.y}) ? HIPRT_ONE_BF16
1417  : HIPRT_ZERO_BF16}};
1418 }
1419 
1424 __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hmax2(const __hip_bfloat162 a,
1425  const __hip_bfloat162 b) {
1426  __hip_bfloat162_raw hr_a = a;
1427  __hip_bfloat162_raw hr_b = b;
1428  return __hip_bfloat162(__hmax(__hip_bfloat16_raw{hr_a.x}, __hip_bfloat16_raw{hr_b.x}),
1429  __hmax(__hip_bfloat16_raw{hr_a.y}, __hip_bfloat16_raw{hr_b.y}));
1430 }
1431 
1436 __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hmin2(const __hip_bfloat162 a,
1437  const __hip_bfloat162 b) {
1438  __hip_bfloat162_raw hr_a = a;
1439  __hip_bfloat162_raw hr_b = b;
1440  return __hip_bfloat162(__hmin(__hip_bfloat16_raw{hr_a.x}, __hip_bfloat16_raw{hr_b.x}),
1441  __hmin(__hip_bfloat16_raw{hr_a.y}, __hip_bfloat16_raw{hr_b.y}));
1442 }
1443 
1448 __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hne2(const __hip_bfloat162 a,
1449  const __hip_bfloat162 b) {
1450  __hip_bfloat162_raw hr_a = a;
1451  __hip_bfloat162_raw hr_b = b;
1452  return __hip_bfloat162{
1453  {__hne(__hip_bfloat16_raw{hr_a.x}, __hip_bfloat16_raw{hr_b.x}) ? HIPRT_ONE_BF16
1454  : HIPRT_ZERO_BF16},
1455  {__hne(__hip_bfloat16_raw{hr_a.y}, __hip_bfloat16_raw{hr_b.y}) ? HIPRT_ONE_BF16
1456  : HIPRT_ZERO_BF16}};
1457 }
1458 
1463 __BF16_HOST_DEVICE_STATIC__ bool operator==(const __hip_bfloat16& l, const __hip_bfloat16& r) {
1464  return __heq(l, r);
1465 }
1466 
1471 __BF16_HOST_DEVICE_STATIC__ bool operator!=(const __hip_bfloat16& l, const __hip_bfloat16& r) {
1472  return __hne(l, r);
1473 }
1474 
1479 __BF16_HOST_DEVICE_STATIC__ bool operator<(const __hip_bfloat16& l, const __hip_bfloat16& r) {
1480  return __hlt(l, r);
1481 }
1482 
1487 __BF16_HOST_DEVICE_STATIC__ bool operator<=(const __hip_bfloat16& l, const __hip_bfloat16& r) {
1488  return __hle(l, r);
1489 }
1490 
1495 __BF16_HOST_DEVICE_STATIC__ bool operator>(const __hip_bfloat16& l, const __hip_bfloat16& r) {
1496  return __hgt(l, r);
1497 }
1498 
1503 __BF16_HOST_DEVICE_STATIC__ bool operator>=(const __hip_bfloat16& l, const __hip_bfloat16& r) {
1504  return __hge(l, r);
1505 }
1506 
1511 __BF16_HOST_DEVICE_STATIC__ bool operator==(const __hip_bfloat162& l, const __hip_bfloat162& r) {
1512  float2 ret = __heq2(l, r);
1513  return ret.x != 0.0f && ret.y != 0.0f;
1514 }
1515 
1520 __BF16_HOST_DEVICE_STATIC__ bool operator!=(const __hip_bfloat162& l, const __hip_bfloat162& r) {
1521  return !(l == r);
1522 }
1523 
1528 __BF16_HOST_DEVICE_STATIC__ bool operator<(const __hip_bfloat162& l, const __hip_bfloat162& r) {
1529  float2 fl = l, fr = r;
1530  return fl.x < fr.x && fl.x < fr.y;
1531 }
1532 
1537 __BF16_HOST_DEVICE_STATIC__ bool operator<=(const __hip_bfloat162& l, const __hip_bfloat162& r) {
1538  float2 fl = l, fr = r;
1539  return fl.x <= fr.x && fl.x <= fr.y;
1540 }
1541 
1546 __BF16_HOST_DEVICE_STATIC__ bool operator>(const __hip_bfloat162& l, const __hip_bfloat162& r) {
1547  float2 fl = l, fr = r;
1548  return fl.x > fr.x && fl.x > fr.y;
1549 }
1550 
1555 __BF16_HOST_DEVICE_STATIC__ bool operator>=(const __hip_bfloat162& l, const __hip_bfloat162& r) {
1556  float2 fl = l, fr = r;
1557  return fl.x >= fr.x && fl.x >= fr.y;
1558 }
1559 
1564 __BF16_DEVICE_STATIC__ __hip_bfloat16 hceil(const __hip_bfloat16 h) {
1565  return __float2bfloat16(__ocml_ceil_f32(__bfloat162float(h)));
1566 }
1567 
1572 __BF16_DEVICE_STATIC__ __hip_bfloat16 hcos(const __hip_bfloat16 h) {
1573  return __float2bfloat16(__ocml_cos_f32(__bfloat162float(h)));
1574 }
1575 
1580 __BF16_DEVICE_STATIC__ __hip_bfloat16 hexp(const __hip_bfloat16 h) {
1581  return __float2bfloat16(__ocml_exp_f32(__bfloat162float(h)));
1582 }
1583 
1588 __BF16_DEVICE_STATIC__ __hip_bfloat16 hexp10(const __hip_bfloat16 h) {
1589  return __float2bfloat16(__ocml_exp10_f32(__bfloat162float(h)));
1590 }
1591 
1596 __BF16_DEVICE_STATIC__ __hip_bfloat16 hexp2(const __hip_bfloat16 h) {
1597  return __float2bfloat16(__ocml_exp2_f32(__bfloat162float(h)));
1598 }
1599 
1604 __BF16_DEVICE_STATIC__ __hip_bfloat16 hfloor(const __hip_bfloat16 h) {
1605  return __float2bfloat16(__ocml_floor_f32(__bfloat162float(h)));
1606 }
1607 
1612 __BF16_DEVICE_STATIC__ __hip_bfloat16 hlog(const __hip_bfloat16 h) {
1613  return __float2bfloat16(__ocml_log_f32(__bfloat162float(h)));
1614 }
1615 
1620 __BF16_DEVICE_STATIC__ __hip_bfloat16 hlog10(const __hip_bfloat16 h) {
1621  return __float2bfloat16(__ocml_log10_f32(__bfloat162float(h)));
1622 }
1623 
1628 __BF16_DEVICE_STATIC__ __hip_bfloat16 hlog2(const __hip_bfloat16 h) {
1629  return __float2bfloat16(__ocml_log2_f32(__bfloat162float(h)));
1630 }
1631 
1636 __BF16_DEVICE_STATIC__ __hip_bfloat16 hrcp(const __hip_bfloat16 h) {
1637  return __float2bfloat16(1.0f / (__bfloat162float(h)));
1638 }
1639 
1644 __BF16_DEVICE_STATIC__ __hip_bfloat16 hrint(const __hip_bfloat16 h) {
1645  return __float2bfloat16(__ocml_rint_f32(__bfloat162float(h)));
1646 }
1647 
1652 __BF16_DEVICE_STATIC__ __hip_bfloat16 hrsqrt(const __hip_bfloat16 h) {
1653  return __float2bfloat16(__ocml_rsqrt_f32(__bfloat162float(h)));
1654 }
1655 
1660 __BF16_DEVICE_STATIC__ __hip_bfloat16 hsin(const __hip_bfloat16 h) {
1661  return __float2bfloat16(__ocml_sin_f32(__bfloat162float(h)));
1662 }
1663 
1668 __BF16_DEVICE_STATIC__ __hip_bfloat16 hsqrt(const __hip_bfloat16 h) {
1669  return __float2bfloat16(__ocml_sqrt_f32(__bfloat162float(h)));
1670 }
1671 
1676 __BF16_DEVICE_STATIC__ __hip_bfloat16 htrunc(const __hip_bfloat16 h) {
1677  return __float2bfloat16(__ocml_trunc_f32(__bfloat162float(h)));
1678 }
1679 
1684 __BF16_DEVICE_STATIC__ __hip_bfloat162 h2ceil(const __hip_bfloat162 h) {
1685  __hip_bfloat162_raw hr = h;
1686  return __hip_bfloat162(hceil(__hip_bfloat16_raw{hr.x}), hceil(__hip_bfloat16_raw{hr.y}));
1687 }
1688 
1693 __BF16_DEVICE_STATIC__ __hip_bfloat162 h2cos(const __hip_bfloat162 h) {
1694  __hip_bfloat162_raw hr = h;
1695  return __hip_bfloat162(hcos(__hip_bfloat16_raw{hr.x}), hcos(__hip_bfloat16_raw{hr.y}));
1696 }
1697 
1702 __BF16_DEVICE_STATIC__ __hip_bfloat162 h2exp(const __hip_bfloat162 h) {
1703  __hip_bfloat162_raw hr = h;
1704  return __hip_bfloat162(hexp(__hip_bfloat16_raw{hr.x}), hexp(__hip_bfloat16_raw{hr.y}));
1705 }
1706 
1711 __BF16_DEVICE_STATIC__ __hip_bfloat162 h2exp10(const __hip_bfloat162 h) {
1712  __hip_bfloat162_raw hr = h;
1713  return __hip_bfloat162(hexp10(__hip_bfloat16_raw{hr.x}), hexp10(__hip_bfloat16_raw{hr.y}));
1714 }
1715 
1720 __BF16_DEVICE_STATIC__ __hip_bfloat162 h2exp2(const __hip_bfloat162 h) {
1721  __hip_bfloat162_raw hr = h;
1722  return __hip_bfloat162(hexp2(__hip_bfloat16_raw{hr.x}), hexp2(__hip_bfloat16_raw{hr.y}));
1723 }
1724 
1729 __BF16_DEVICE_STATIC__ __hip_bfloat162 h2floor(const __hip_bfloat162 h) {
1730  __hip_bfloat162_raw hr = h;
1731  return __hip_bfloat162(hfloor(__hip_bfloat16_raw{hr.x}), hfloor(__hip_bfloat16_raw{hr.y}));
1732 }
1733 
1738 __BF16_DEVICE_STATIC__ __hip_bfloat162 h2log(const __hip_bfloat162 h) {
1739  __hip_bfloat162_raw hr = h;
1740  return __hip_bfloat162(hlog(__hip_bfloat16_raw{hr.x}), hlog(__hip_bfloat16_raw{hr.y}));
1741 }
1742 
1747 __BF16_DEVICE_STATIC__ __hip_bfloat162 h2log10(const __hip_bfloat162 h) {
1748  __hip_bfloat162_raw hr = h;
1749  return __hip_bfloat162(hlog10(__hip_bfloat16_raw{hr.x}), hlog10(__hip_bfloat16_raw{hr.y}));
1750 }
1751 
1756 __BF16_DEVICE_STATIC__ __hip_bfloat162 h2log2(const __hip_bfloat162 h) {
1757  __hip_bfloat162_raw hr = h;
1758  return __hip_bfloat162(hlog2(__hip_bfloat16_raw{hr.x}), hlog2(__hip_bfloat16_raw{hr.y}));
1759 }
1760 
1765 __BF16_DEVICE_STATIC__ __hip_bfloat162 h2rcp(const __hip_bfloat162 h) {
1766  __hip_bfloat162_raw hr = h;
1767  return __hip_bfloat162(hrcp(__hip_bfloat16_raw{hr.x}), hrcp(__hip_bfloat16_raw{hr.y}));
1768 }
1769 
1774 __BF16_DEVICE_STATIC__ __hip_bfloat162 h2rint(const __hip_bfloat162 h) {
1775  __hip_bfloat162_raw hr = h;
1776  return __hip_bfloat162(hrint(__hip_bfloat16_raw{hr.x}), hrint(__hip_bfloat16_raw{hr.y}));
1777 }
1778 
1783 __BF16_DEVICE_STATIC__ __hip_bfloat162 h2rsqrt(const __hip_bfloat162 h) {
1784  __hip_bfloat162_raw hr = h;
1785  return __hip_bfloat162(hrsqrt(__hip_bfloat16_raw{hr.x}), hrsqrt(__hip_bfloat16_raw{hr.y}));
1786 }
1787 
1792 __BF16_DEVICE_STATIC__ __hip_bfloat162 h2sin(const __hip_bfloat162 h) {
1793  __hip_bfloat162_raw hr = h;
1794  return __hip_bfloat162(hsin(__hip_bfloat16_raw{hr.x}), hsin(__hip_bfloat16_raw{hr.y}));
1795 }
1796 
1801 __BF16_DEVICE_STATIC__ __hip_bfloat162 h2sqrt(const __hip_bfloat162 h) {
1802  __hip_bfloat162_raw hr = h;
1803  return __hip_bfloat162(hsqrt(__hip_bfloat16_raw{hr.x}), hsqrt(__hip_bfloat16_raw{hr.y}));
1804 }
1805 
1810 __BF16_DEVICE_STATIC__ __hip_bfloat162 h2trunc(const __hip_bfloat162 h) {
1811  __hip_bfloat162_raw hr = h;
1812  return __hip_bfloat162(htrunc(__hip_bfloat16_raw{hr.x}), htrunc(__hip_bfloat16_raw{hr.y}));
1813 }
1814 #endif
Contains declarations for types and functions in device library. Uses int64_t and uint64_t instead of...
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hsub(const __hip_bfloat16 a, const __hip_bfloat16 b)
Subtracts two bfloat16 values.
Definition: amd_hip_bf16.h:681
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 operator+(const __hip_bfloat16 &l)
Operator to unary+ on a __hip_bfloat16 number.
Definition: amd_hip_bf16.h:835
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 & operator/=(__hip_bfloat16 &l, const __hip_bfloat16 &r)
Operator to divide-assign two __hip_bfloat16 numbers.
Definition: amd_hip_bf16.h:930
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 operator/(const __hip_bfloat16 &l, const __hip_bfloat16 &r)
Operator to divide two __hip_bfloat16 numbers.
Definition: amd_hip_bf16.h:921
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 & operator*=(__hip_bfloat16 &l, const __hip_bfloat16 &r)
Operator to multiply-assign two __hip_bfloat16 numbers.
Definition: amd_hip_bf16.h:826
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 operator-(const __hip_bfloat16 &l)
Operator to negate a __hip_bfloat16 number.
Definition: amd_hip_bf16.h:850
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hneg(const __hip_bfloat16 a)
Negate a bfloat16 value.
Definition: amd_hip_bf16.h:715
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 & operator+=(__hip_bfloat16 &l, const __hip_bfloat16 &r)
Operator to add-assign two __hip_bfloat16 numbers.
Definition: amd_hip_bf16.h:903
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hadd(const __hip_bfloat16 a, const __hip_bfloat16 b)
Adds two bfloat16 values.
Definition: amd_hip_bf16.h:673
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 operator*(const __hip_bfloat16 &l, const __hip_bfloat16 &r)
Operator to multiply two __hip_bfloat16 numbers.
Definition: amd_hip_bf16.h:817
__BF16_DEVICE_STATIC__ __hip_bfloat16 __hfma(const __hip_bfloat16 a, const __hip_bfloat16 b, const __hip_bfloat16 c)
Performs FMA of given bfloat16 values.
Definition: amd_hip_bf16.h:697
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hmul(const __hip_bfloat16 a, const __hip_bfloat16 b)
Multiplies two bfloat16 values.
Definition: amd_hip_bf16.h:707
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 operator++(__hip_bfloat16 &l, const int)
Operator to post increment a __hip_bfloat16 number.
Definition: amd_hip_bf16.h:865
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __habs(const __hip_bfloat16 a)
Returns absolute of a bfloat16.
Definition: amd_hip_bf16.h:725
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 operator--(__hip_bfloat16 &l, const int)
Operator to post decrement a __hip_bfloat16 number.
Definition: amd_hip_bf16.h:884
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hdiv(const __hip_bfloat16 a, const __hip_bfloat16 b)
Divides two bfloat16 values.
Definition: amd_hip_bf16.h:689
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 & operator-=(__hip_bfloat16 &l, const __hip_bfloat16 &r)
Operator to subtract-assign two __hip_bfloat16 numbers.
Definition: amd_hip_bf16.h:912
__BF16_HOST_DEVICE_STATIC__ bool __heq(const __hip_bfloat16 a, const __hip_bfloat16 b)
Compare two bfloat162 values.
Definition: amd_hip_bf16.h:1067
__BF16_HOST_DEVICE_STATIC__ bool __hgt(const __hip_bfloat16 a, const __hip_bfloat16 b)
Compare two bfloat162 values - greater than.
Definition: amd_hip_bf16.h:1084
__BF16_HOST_DEVICE_STATIC__ bool __hne(const __hip_bfloat16 a, const __hip_bfloat16 b)
Compare two bfloat162 values - not equal.
Definition: amd_hip_bf16.h:1116
__BF16_HOST_DEVICE_STATIC__ bool operator==(const __hip_bfloat16 &l, const __hip_bfloat16 &r)
Operator to perform an equal compare on two __hip_bfloat16 numbers.
Definition: amd_hip_bf16.h:1463
__BF16_HOST_DEVICE_STATIC__ bool __hle(const __hip_bfloat16 a, const __hip_bfloat16 b)
Compare two bfloat162 values - less than equal.
Definition: amd_hip_bf16.h:1172
__BF16_HOST_DEVICE_STATIC__ bool operator!=(const __hip_bfloat16 &l, const __hip_bfloat16 &r)
Operator to perform a not equal on two __hip_bfloat16 numbers.
Definition: amd_hip_bf16.h:1471
__BF16_HOST_DEVICE_STATIC__ bool operator>(const __hip_bfloat16 &l, const __hip_bfloat16 &r)
Operator to perform a greater than on two __hip_bfloat16 numbers.
Definition: amd_hip_bf16.h:1495
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hmin(const __hip_bfloat16 a, const __hip_bfloat16 b)
Compare two bfloat162 values - return min.
Definition: amd_hip_bf16.h:1144
__BF16_HOST_DEVICE_STATIC__ bool __hge(const __hip_bfloat16 a, const __hip_bfloat16 b)
Compare two bfloat162 values - greater than equal.
Definition: amd_hip_bf16.h:1100
__BF16_HOST_DEVICE_STATIC__ bool __hleu(const __hip_bfloat16 a, const __hip_bfloat16 b)
Compare two bfloat162 values - unordered less than equal.
Definition: amd_hip_bf16.h:1180
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hmax(const __hip_bfloat16 a, const __hip_bfloat16 b)
Compare two bfloat162 values - return max.
Definition: amd_hip_bf16.h:1132
__BF16_HOST_DEVICE_STATIC__ bool operator<=(const __hip_bfloat16 &l, const __hip_bfloat16 &r)
Operator to perform a less than equal on two __hip_bfloat16 numbers.
Definition: amd_hip_bf16.h:1487
__BF16_HOST_DEVICE_STATIC__ int __hisinf(const __hip_bfloat16 a)
Checks if number is inf.
Definition: amd_hip_bf16.h:1188
__BF16_HOST_DEVICE_STATIC__ bool __hneu(const __hip_bfloat16 a, const __hip_bfloat16 b)
Compare two bfloat162 values - unordered not equal.
Definition: amd_hip_bf16.h:1124
__BF16_HOST_DEVICE_STATIC__ bool __hgtu(const __hip_bfloat16 a, const __hip_bfloat16 b)
Compare two bfloat162 values - unordered greater than.
Definition: amd_hip_bf16.h:1092
__BF16_HOST_DEVICE_STATIC__ bool operator<(const __hip_bfloat16 &l, const __hip_bfloat16 &r)
Operator to perform a less than on two __hip_bfloat16 numbers.
Definition: amd_hip_bf16.h:1479
__BF16_HOST_DEVICE_STATIC__ bool __hltu(const __hip_bfloat16 a, const __hip_bfloat16 b)
Compare two bfloat162 values - unordered less than.
Definition: amd_hip_bf16.h:1164
__BF16_HOST_DEVICE_STATIC__ bool __hgeu(const __hip_bfloat16 a, const __hip_bfloat16 b)
Compare two bfloat162 values - unordered greater than equal.
Definition: amd_hip_bf16.h:1108
__BF16_HOST_DEVICE_STATIC__ bool __hlt(const __hip_bfloat16 a, const __hip_bfloat16 b)
Compare two bfloat162 values - less than operator.
Definition: amd_hip_bf16.h:1156
__BF16_HOST_DEVICE_STATIC__ bool operator>=(const __hip_bfloat16 &l, const __hip_bfloat16 &r)
Operator to perform a greater than equal on two __hip_bfloat16 numbers.
Definition: amd_hip_bf16.h:1503
__BF16_HOST_DEVICE_STATIC__ bool __hisnan(const __hip_bfloat16 a)
Checks if number is nan.
Definition: amd_hip_bf16.h:1197
__BF16_HOST_DEVICE_STATIC__ bool __hequ(const __hip_bfloat16 a, const __hip_bfloat16 b)
Compare two bfloat162 values - unordered equal.
Definition: amd_hip_bf16.h:1075
__BF16_HOST_DEVICE_STATIC__ bool __hbgeu2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Check for a >= b - unordered.
Definition: amd_hip_bf16.h:1239
__BF16_HOST_DEVICE_STATIC__ bool __hbgt2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Check for a > b.
Definition: amd_hip_bf16.h:1250
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hmax2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Returns max of two elements.
Definition: amd_hip_bf16.h:1424
__BF16_HOST_DEVICE_STATIC__ bool __hble2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Check for a <= b.
Definition: amd_hip_bf16.h:1272
__BF16_HOST_DEVICE_STATIC__ bool __hblt2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Check for a < b.
Definition: amd_hip_bf16.h:1294
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hmin2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Returns min of two elements.
Definition: amd_hip_bf16.h:1436
__BF16_HOST_DEVICE_STATIC__ bool __hbleu2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Check for a <= b - unordered.
Definition: amd_hip_bf16.h:1283
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hne2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Checks for not equal to.
Definition: amd_hip_bf16.h:1448
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hge2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Check for a >= b, returns 1.0 if greater than equal, otherwise 0.0.
Definition: amd_hip_bf16.h:1354
__BF16_HOST_DEVICE_STATIC__ bool __hbne2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Check for a != b.
Definition: amd_hip_bf16.h:1316
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hlt2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Check for a < b, returns 1.0 if greater than equal, otherwise 0.0.
Definition: amd_hip_bf16.h:1409
__BF16_HOST_DEVICE_STATIC__ bool __hbltu2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Check for a < b - unordered.
Definition: amd_hip_bf16.h:1305
__BF16_HOST_DEVICE_STATIC__ bool __hbequ2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Checks if two numbers are equal - unordered.
Definition: amd_hip_bf16.h:1217
__BF16_HOST_DEVICE_STATIC__ bool __hbeq2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Checks if two numbers are equal.
Definition: amd_hip_bf16.h:1206
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hle2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Check for a <= b, returns 1.0 if greater than equal, otherwise 0.0.
Definition: amd_hip_bf16.h:1394
__BF16_HOST_DEVICE_STATIC__ bool __hbneu2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Check for a != b.
Definition: amd_hip_bf16.h:1328
__BF16_HOST_DEVICE_STATIC__ bool __hbgtu2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Check for a > b - unordered.
Definition: amd_hip_bf16.h:1261
__BF16_HOST_DEVICE_STATIC__ bool __hbge2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Check for a >= b.
Definition: amd_hip_bf16.h:1228
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __heq2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Check for a != b, returns 1.0 if equal, otherwise 0.0.
Definition: amd_hip_bf16.h:1339
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hgt2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Check for a > b, returns 1.0 if greater than equal, otherwise 0.0.
Definition: amd_hip_bf16.h:1369
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hisnan2(const __hip_bfloat162 a)
Check for a is NaN, returns 1.0 if NaN, otherwise 0.0.
Definition: amd_hip_bf16.h:1384
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __habs2(const __hip_bfloat162 a)
Returns absolute of a bfloat162.
Definition: amd_hip_bf16.h:749
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hsub2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Subtracts two bfloat162 values.
Definition: amd_hip_bf16.h:805
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hmul2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Multiplies two bfloat162 values.
Definition: amd_hip_bf16.h:784
__BF16_DEVICE_STATIC__ __hip_bfloat162 __hfma2(const __hip_bfloat162 a, const __hip_bfloat162 b, const __hip_bfloat162 c)
Performs FMA of given bfloat162 values.
Definition: amd_hip_bf16.h:770
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hadd2(const __hip_bfloat162 a, const __hip_bfloat162 b)
Adds two bfloat162 values.
Definition: amd_hip_bf16.h:758
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hneg2(const __hip_bfloat162 a)
Converts a bfloat162 into negative.
Definition: amd_hip_bf16.h:796
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __h2div(const __hip_bfloat162 a, const __hip_bfloat162 b)
Divides bfloat162 values.
Definition: amd_hip_bf16.h:735
__BF16_HOST_DEVICE_STATIC__ float __bfloat162float(__hip_bfloat16 a)
Converts bfloat16 to float.
Definition: amd_hip_bf16.h:493
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __float2bfloat16(float f)
Converts float to bfloat16.
Definition: amd_hip_bf16.h:502
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __bfloat162bfloat162(const __hip_bfloat16 a)
Moves bfloat16 value to bfloat162.
Definition: amd_hip_bf16.h:520
__BF16_HOST_DEVICE_STATIC__ unsigned short int __bfloat16_as_ushort(const __hip_bfloat16 h)
Reinterprets bits in a __hip_bfloat16 as an unsigned signed short integer.
Definition: amd_hip_bf16.h:537
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __ushort_as_bfloat16(const unsigned short int a)
Reinterprets unsigned short int into a bfloat16.
Definition: amd_hip_bf16.h:665
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __low2bfloat16(const __hip_bfloat162 a)
Returns low 16 bits of __hip_bfloat162.
Definition: amd_hip_bf16.h:610
__BF16_HOST_DEVICE_STATIC__ float2 __bfloat1622float2(const __hip_bfloat162 a)
Converts and moves bfloat162 to float2.
Definition: amd_hip_bf16.h:511
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __halves2bfloat162(const __hip_bfloat16 a, const __hip_bfloat16 b)
Combine two __hip_bfloat16 to __hip_bfloat162.
Definition: amd_hip_bf16.h:563
__BF16_HOST_DEVICE_STATIC__ short int __bfloat16_as_short(const __hip_bfloat16 h)
Reinterprets bits in a __hip_bfloat16 as a signed short integer.
Definition: amd_hip_bf16.h:528
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __double2bfloat16(const double a)
Convert double to __hip_bfloat16.
Definition: amd_hip_bf16.h:546
__BF16_HOST_DEVICE_STATIC__ float __low2float(const __hip_bfloat162 a)
Converts low 16 bits of __hip_bfloat162 to float and returns the result.
Definition: amd_hip_bf16.h:628
__BF16_HOST_DEVICE_STATIC__ float __high2float(const __hip_bfloat162 a)
Converts high 16 bits of __hip_bfloat162 to float and returns the result.
Definition: amd_hip_bf16.h:590
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __low2bfloat162(const __hip_bfloat162 a)
Returns low 16 bits of __hip_bfloat162.
Definition: amd_hip_bf16.h:619
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __float22bfloat162_rn(const float2 a)
Convert float2 to __hip_bfloat162.
Definition: amd_hip_bf16.h:555
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __highs2bfloat162(const __hip_bfloat162 a, const __hip_bfloat162 b)
Extracts high 16 bits from each and combines them.
Definition: amd_hip_bf16.h:599
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __lows2bfloat162(const __hip_bfloat162 a, const __hip_bfloat162 b)
Extracts low 16 bits from each and combines them.
Definition: amd_hip_bf16.h:646
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __high2bfloat16(const __hip_bfloat162 a)
Returns high 16 bits of __hip_bfloat162.
Definition: amd_hip_bf16.h:572
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __high2bfloat162(const __hip_bfloat162 a)
Returns high 16 bits of __hip_bfloat162.
Definition: amd_hip_bf16.h:581
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __short_as_bfloat16(const short int a)
Reinterprets short int into a bfloat16.
Definition: amd_hip_bf16.h:657
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __lowhigh2highlow(const __hip_bfloat162 a)
Swaps both halves.
Definition: amd_hip_bf16.h:637
__BF16_DEVICE_STATIC__ __hip_bfloat16 hexp10(const __hip_bfloat16 h)
Calculate exponential 10 of bfloat16.
Definition: amd_hip_bf16.h:1588
__BF16_DEVICE_STATIC__ __hip_bfloat16 hlog(const __hip_bfloat16 h)
Calculate natural log of bfloat16.
Definition: amd_hip_bf16.h:1612
__BF16_DEVICE_STATIC__ __hip_bfloat16 hexp2(const __hip_bfloat16 h)
Calculate exponential 2 of bfloat16.
Definition: amd_hip_bf16.h:1596
__BF16_DEVICE_STATIC__ __hip_bfloat16 hceil(const __hip_bfloat16 h)
Calculate ceil of bfloat16.
Definition: amd_hip_bf16.h:1564
__BF16_DEVICE_STATIC__ __hip_bfloat16 hrcp(const __hip_bfloat16 h)
Calculate reciprocal.
Definition: amd_hip_bf16.h:1636
__BF16_DEVICE_STATIC__ __hip_bfloat16 hsqrt(const __hip_bfloat16 h)
Calculate sqrt of bfloat16.
Definition: amd_hip_bf16.h:1668
__BF16_DEVICE_STATIC__ __hip_bfloat16 hlog10(const __hip_bfloat16 h)
Calculate log 10 of bfloat16.
Definition: amd_hip_bf16.h:1620
__BF16_DEVICE_STATIC__ __hip_bfloat16 hsin(const __hip_bfloat16 h)
Calculate sin of bfloat16.
Definition: amd_hip_bf16.h:1660
__BF16_DEVICE_STATIC__ __hip_bfloat16 hfloor(const __hip_bfloat16 h)
Calculate floor of bfloat16.
Definition: amd_hip_bf16.h:1604
__BF16_DEVICE_STATIC__ __hip_bfloat16 hrint(const __hip_bfloat16 h)
Round to nearest int.
Definition: amd_hip_bf16.h:1644
__BF16_DEVICE_STATIC__ __hip_bfloat16 htrunc(const __hip_bfloat16 h)
Calculate truncate of bfloat16.
Definition: amd_hip_bf16.h:1676
__BF16_DEVICE_STATIC__ __hip_bfloat16 hrsqrt(const __hip_bfloat16 h)
Reciprocal square root.
Definition: amd_hip_bf16.h:1652
__BF16_DEVICE_STATIC__ __hip_bfloat16 hcos(const __hip_bfloat16 h)
Calculate cosine of bfloat16.
Definition: amd_hip_bf16.h:1572
__BF16_DEVICE_STATIC__ __hip_bfloat16 hlog2(const __hip_bfloat16 h)
Calculate log 2 of bfloat16.
Definition: amd_hip_bf16.h:1628
__BF16_DEVICE_STATIC__ __hip_bfloat16 hexp(const __hip_bfloat16 h)
Calculate exponential of bfloat16.
Definition: amd_hip_bf16.h:1580
__BF16_DEVICE_STATIC__ __hip_bfloat162 h2trunc(const __hip_bfloat162 h)
Calculate truncate of bfloat162.
Definition: amd_hip_bf16.h:1810
__BF16_DEVICE_STATIC__ __hip_bfloat162 h2rcp(const __hip_bfloat162 h)
Calculate vector reciprocal.
Definition: amd_hip_bf16.h:1765
__BF16_DEVICE_STATIC__ __hip_bfloat162 h2log(const __hip_bfloat162 h)
Calculate natural log of bfloat162.
Definition: amd_hip_bf16.h:1738
__BF16_DEVICE_STATIC__ __hip_bfloat162 h2exp(const __hip_bfloat162 h)
Calculate exponential of bfloat162.
Definition: amd_hip_bf16.h:1702
__BF16_DEVICE_STATIC__ __hip_bfloat162 h2cos(const __hip_bfloat162 h)
Calculate cosine of bfloat162.
Definition: amd_hip_bf16.h:1693
__BF16_DEVICE_STATIC__ __hip_bfloat162 h2sin(const __hip_bfloat162 h)
Calculate sin of bfloat162.
Definition: amd_hip_bf16.h:1792
__BF16_DEVICE_STATIC__ __hip_bfloat162 h2log2(const __hip_bfloat162 h)
Calculate log 2 of bfloat162.
Definition: amd_hip_bf16.h:1756
__BF16_DEVICE_STATIC__ __hip_bfloat162 h2ceil(const __hip_bfloat162 h)
Calculate ceil of bfloat162.
Definition: amd_hip_bf16.h:1684
__BF16_DEVICE_STATIC__ __hip_bfloat162 h2floor(const __hip_bfloat162 h)
Calculate floor of bfloat162.
Definition: amd_hip_bf16.h:1729
__BF16_DEVICE_STATIC__ __hip_bfloat162 h2exp10(const __hip_bfloat162 h)
Calculate exponential 10 of bfloat162.
Definition: amd_hip_bf16.h:1711
__BF16_DEVICE_STATIC__ __hip_bfloat162 h2exp2(const __hip_bfloat162 h)
Calculate exponential 2 of bfloat162.
Definition: amd_hip_bf16.h:1720
__BF16_DEVICE_STATIC__ __hip_bfloat162 h2log10(const __hip_bfloat162 h)
Calculate log 10 of bfloat162.
Definition: amd_hip_bf16.h:1747
__BF16_DEVICE_STATIC__ __hip_bfloat162 h2rsqrt(const __hip_bfloat162 h)
Calculate vector reciprocal square root.
Definition: amd_hip_bf16.h:1783
__BF16_DEVICE_STATIC__ __hip_bfloat162 h2rint(const __hip_bfloat162 h)
Calculate vector round to nearest int.
Definition: amd_hip_bf16.h:1774
__BF16_DEVICE_STATIC__ __hip_bfloat162 h2sqrt(const __hip_bfloat162 h)
Calculate sqrt of bfloat162.
Definition: amd_hip_bf16.h:1801
struct __attribute__((aligned(2)))
represents raw bfloat16 type
Definition: amd_hip_bf16.h:153
Definition: amd_hip_vector_types.h:2035