HIP: Heterogeneous-computing Interface for Portability
amd_hip_unsafe_atomics.h
/*
Copyright (c) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

#pragma once

#ifdef __cplusplus

// Fast ("unsafe") FP32 atomic add; returns the old value of *addr. On gfx90a
// it dispatches on the pointer's address space to hardware FP atomic builtins;
// on other targets it falls back to a relaxed atomic fetch-add.
__device__ inline float unsafeAtomicAdd(float* addr, float value) {
#if defined(__gfx90a__) && \
    __has_builtin(__builtin_amdgcn_is_shared) && \
    __has_builtin(__builtin_amdgcn_is_private) && \
    __has_builtin(__builtin_amdgcn_ds_atomic_fadd_f32) && \
    __has_builtin(__builtin_amdgcn_global_atomic_fadd_f32)
  if (__builtin_amdgcn_is_shared(
          (const __attribute__((address_space(0))) void*)addr))
    return __builtin_amdgcn_ds_atomic_fadd_f32(addr, value);
  else if (__builtin_amdgcn_is_private(
               (const __attribute__((address_space(0))) void*)addr)) {
    float temp = *addr;
    *addr = temp + value;
    return temp;
  }
  else
    return __builtin_amdgcn_global_atomic_fadd_f32(addr, value);
#elif __has_builtin(__hip_atomic_fetch_add)
  return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#else
  return __atomic_fetch_add(addr, value, __ATOMIC_RELAXED);
#endif
}

// FP32 atomic max implemented as a relaxed compare-and-swap loop; returns the
// last value read from addr.
__device__ inline float unsafeAtomicMax(float* addr, float val) {
  #if __has_builtin(__hip_atomic_load) && \
      __has_builtin(__hip_atomic_compare_exchange_strong)
  float value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  bool done = false;
  while (!done && value < val) {
    done = __hip_atomic_compare_exchange_strong(addr, &value, val,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  }
  return value;
  #else
  unsigned int *uaddr = (unsigned int *)addr;
  unsigned int value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
  bool done = false;
  while (!done && __uint_as_float(value) < val) {
    done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED);
  }
  return __uint_as_float(value);
  #endif
}

// FP32 atomic min implemented as a relaxed compare-and-swap loop; returns the
// last value read from addr.
__device__ inline float unsafeAtomicMin(float* addr, float val) {
  #if __has_builtin(__hip_atomic_load) && \
      __has_builtin(__hip_atomic_compare_exchange_strong)
  float value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  bool done = false;
  while (!done && value > val) {
    done = __hip_atomic_compare_exchange_strong(addr, &value, val,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  }
  return value;
  #else
  unsigned int *uaddr = (unsigned int *)addr;
  unsigned int value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
  bool done = false;
  while (!done && __uint_as_float(value) > val) {
    done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED);
  }
  return __uint_as_float(value);
  #endif
}

// Fast ("unsafe") FP64 atomic add; uses the flat FP64 atomic-add builtin on
// gfx90a and a relaxed atomic fetch-add otherwise. Returns the old value of *addr.
__device__ inline double unsafeAtomicAdd(double* addr, double value) {
#if defined(__gfx90a__) && __has_builtin(__builtin_amdgcn_flat_atomic_fadd_f64)
  return __builtin_amdgcn_flat_atomic_fadd_f64(addr, value);
#elif __has_builtin(__hip_atomic_fetch_add)
  return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#else
  return __atomic_fetch_add(addr, value, __ATOMIC_RELAXED);
#endif
}

// FP64 atomic max; uses the flat FP64 atomic-max builtin on gfx90a and
// gfx940/941/942, and a compare-and-swap loop otherwise. Returns the last
// value read from addr.
__device__ inline double unsafeAtomicMax(double* addr, double val) {
#if (defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)) && \
    __has_builtin(__builtin_amdgcn_flat_atomic_fmax_f64)
  return __builtin_amdgcn_flat_atomic_fmax_f64(addr, val);
#else
  #if __has_builtin(__hip_atomic_load) && \
      __has_builtin(__hip_atomic_compare_exchange_strong)
  double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  bool done = false;
  while (!done && value < val) {
    done = __hip_atomic_compare_exchange_strong(addr, &value, val,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  }
  return value;
  #else
  unsigned long long *uaddr = (unsigned long long *)addr;
  unsigned long long value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
  bool done = false;
  while (!done && __longlong_as_double(value) < val) {
    done = __atomic_compare_exchange_n(uaddr, &value, __double_as_longlong(val), false,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED);
  }
  return __longlong_as_double(value);
  #endif
#endif
}

// FP64 atomic min; uses the flat FP64 atomic-min builtin on gfx90a and
// gfx940/941/942, and a compare-and-swap loop otherwise. Returns the last
// value read from addr.
__device__ inline double unsafeAtomicMin(double* addr, double val) {
#if (defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)) && \
    __has_builtin(__builtin_amdgcn_flat_atomic_fmin_f64)
  return __builtin_amdgcn_flat_atomic_fmin_f64(addr, val);
#else
  #if __has_builtin(__hip_atomic_load) && \
      __has_builtin(__hip_atomic_compare_exchange_strong)
  double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  bool done = false;
  while (!done && value > val) {
    done = __hip_atomic_compare_exchange_strong(addr, &value, val,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  }
  return value;
  #else
  unsigned long long *uaddr = (unsigned long long *)addr;
  unsigned long long value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
  bool done = false;
  while (!done && __longlong_as_double(value) > val) {
    done = __atomic_compare_exchange_n(uaddr, &value, __double_as_longlong(val), false,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED);
  }
  return __longlong_as_double(value);
  #endif
#endif
}

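// ---------------------------------------------------------------------------
// Illustrative usage sketch (a documentation aid, not part of the original
// header): a hypothetical kernel that accumulates per-thread contributions
// with the fast "unsafe" FP32 atomic add defined above. The kernel name and
// buffer layout are assumptions chosen for illustration only.
__global__ void example_unsafe_sum(const float* in, float* sum, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    // Uses hardware FP atomics where available; rounding may not follow all
    // IEEE-754 rules, which is acceptable when a small accumulation error is
    // tolerable (for example, histogramming or loss accumulation).
    unsafeAtomicAdd(sum, in[i]);
  }
}
// ---------------------------------------------------------------------------
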
// IEEE-compliant ("safe") FP32 atomic add; forces a compare-and-swap loop on
// targets whose hardware FP32 atomic add is unsafe. Returns the old value of *addr.
__device__ inline float safeAtomicAdd(float* addr, float value) {
#if defined(__gfx908__) || defined(__gfx941__) \
    || ((defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx942__)) \
        && !__has_builtin(__hip_atomic_fetch_add))
  // On gfx908, we can generate unsafe FP32 atomic add that does not follow all
  // IEEE rules when -munsafe-fp-atomics is passed. Do a CAS loop emulation instead.
  // On gfx941, we can generate unsafe FP32 atomic add that may not always happen atomically,
  // so we need to force a CAS loop emulation to ensure safety.
  // On gfx90a, gfx940 and gfx942, if we do not have the __hip_atomic_fetch_add builtin, we
  // need to force a CAS loop here.
  float old_val;
#if __has_builtin(__hip_atomic_load)
  old_val = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#else // !__has_builtin(__hip_atomic_load)
  old_val = __uint_as_float(__atomic_load_n(reinterpret_cast<unsigned int*>(addr), __ATOMIC_RELAXED));
#endif // __has_builtin(__hip_atomic_load)
  float expected, temp;
  do {
    temp = expected = old_val;
#if __has_builtin(__hip_atomic_compare_exchange_strong)
    __hip_atomic_compare_exchange_strong(addr, &expected, old_val + value, __ATOMIC_RELAXED,
                                         __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#else // !__has_builtin(__hip_atomic_compare_exchange_strong)
    __atomic_compare_exchange_n(addr, &expected, old_val + value, false,
                                __ATOMIC_RELAXED, __ATOMIC_RELAXED);
#endif // __has_builtin(__hip_atomic_compare_exchange_strong)
    old_val = expected;
  } while (__float_as_uint(temp) != __float_as_uint(old_val));
  return old_val;
#elif defined(__gfx90a__)
  // On gfx90a, with the __hip_atomic_fetch_add builtin, relaxed system-scope
  // atomics will produce safe CAS loops, but are otherwise not different than
  // agent-scope atomics. This logic is only applicable for gfx90a, and should
  // not be assumed on other architectures.
  return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#elif __has_builtin(__hip_atomic_fetch_add)
  return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#else
  return __atomic_fetch_add(addr, value, __ATOMIC_RELAXED);
#endif
}

// IEEE-compliant FP32 atomic max via a relaxed compare-and-swap loop; returns
// the last value read from addr.
__device__ inline float safeAtomicMax(float* addr, float val) {
  #if __has_builtin(__hip_atomic_load) && \
      __has_builtin(__hip_atomic_compare_exchange_strong)
  float value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  bool done = false;
  while (!done && value < val) {
    done = __hip_atomic_compare_exchange_strong(addr, &value, val,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  }
  return value;
  #else
  unsigned int *uaddr = (unsigned int *)addr;
  unsigned int value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
  bool done = false;
  while (!done && __uint_as_float(value) < val) {
    done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED);
  }
  return __uint_as_float(value);
  #endif
}

// IEEE-compliant FP32 atomic min via a relaxed compare-and-swap loop; returns
// the last value read from addr.
__device__ inline float safeAtomicMin(float* addr, float val) {
  #if __has_builtin(__hip_atomic_load) && \
      __has_builtin(__hip_atomic_compare_exchange_strong)
  float value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  bool done = false;
  while (!done && value > val) {
    done = __hip_atomic_compare_exchange_strong(addr, &value, val,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  }
  return value;
  #else
  unsigned int *uaddr = (unsigned int *)addr;
  unsigned int value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
  bool done = false;
  while (!done && __uint_as_float(value) > val) {
    done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED);
  }
  return __uint_as_float(value);
  #endif
}

// IEEE-compliant FP64 atomic add; uses system-scope atomics or a
// compare-and-swap loop on gfx90a, and a regular atomic fetch-add elsewhere.
// Returns the old value of *addr.
__device__ inline double safeAtomicAdd(double* addr, double value) {
#if defined(__gfx90a__) && __has_builtin(__hip_atomic_fetch_add)
  // On gfx90a, with the __hip_atomic_fetch_add builtin, relaxed system-scope
  // atomics will produce safe CAS loops, but are otherwise not different than
  // agent-scope atomics. This logic is only applicable for gfx90a, and should
  // not be assumed on other architectures.
  return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#elif defined(__gfx90a__)
  // On gfx90a, if we do not have the __hip_atomic_fetch_add builtin, we need to
  // force a CAS loop here.
  double old_val;
#if __has_builtin(__hip_atomic_load)
  old_val = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#else // !__has_builtin(__hip_atomic_load)
  old_val = __longlong_as_double(__atomic_load_n(reinterpret_cast<unsigned long long*>(addr), __ATOMIC_RELAXED));
#endif // __has_builtin(__hip_atomic_load)
  double expected, temp;
  do {
    temp = expected = old_val;
#if __has_builtin(__hip_atomic_compare_exchange_strong)
    __hip_atomic_compare_exchange_strong(addr, &expected, old_val + value, __ATOMIC_RELAXED,
                                         __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#else // !__has_builtin(__hip_atomic_compare_exchange_strong)
    __atomic_compare_exchange_n(addr, &expected, old_val + value, false,
                                __ATOMIC_RELAXED, __ATOMIC_RELAXED);
#endif // __has_builtin(__hip_atomic_compare_exchange_strong)
    old_val = expected;
  } while (__double_as_longlong(temp) != __double_as_longlong(old_val));
  return old_val;
#else // !defined(__gfx90a__)
#if __has_builtin(__hip_atomic_fetch_add)
  return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#else // !__has_builtin(__hip_atomic_fetch_add)
  return __atomic_fetch_add(addr, value, __ATOMIC_RELAXED);
#endif // __has_builtin(__hip_atomic_fetch_add)
#endif
}

// IEEE-compliant FP64 atomic max; pointers into private (stack) memory are
// updated non-atomically, everything else goes through a compare-and-swap
// loop. Returns the value previously stored at addr.
__device__ inline double safeAtomicMax(double* addr, double val) {
  #if __has_builtin(__builtin_amdgcn_is_private)
  if (__builtin_amdgcn_is_private(
          (const __attribute__((address_space(0))) void*)addr)) {
    double old = *addr;
    *addr = __builtin_fmax(old, val);
    return old;
  } else {
  #endif
  #if __has_builtin(__hip_atomic_load) && \
      __has_builtin(__hip_atomic_compare_exchange_strong)
  double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  bool done = false;
  while (!done && value < val) {
    done = __hip_atomic_compare_exchange_strong(addr, &value, val,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  }
  return value;
  #else
  unsigned long long *uaddr = (unsigned long long *)addr;
  unsigned long long value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
  bool done = false;
  while (!done && __longlong_as_double(value) < val) {
    done = __atomic_compare_exchange_n(uaddr, &value, __double_as_longlong(val), false,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED);
  }
  return __longlong_as_double(value);
  #endif
  #if __has_builtin(__builtin_amdgcn_is_private)
  }
  #endif
}

// IEEE-compliant FP64 atomic min; pointers into private (stack) memory are
// updated non-atomically, everything else goes through a compare-and-swap
// loop. Returns the value previously stored at addr.
__device__ inline double safeAtomicMin(double* addr, double val) {
  #if __has_builtin(__builtin_amdgcn_is_private)
  if (__builtin_amdgcn_is_private(
          (const __attribute__((address_space(0))) void*)addr)) {
    double old = *addr;
    *addr = __builtin_fmin(old, val);
    return old;
  } else {
  #endif
  #if __has_builtin(__hip_atomic_load) && \
      __has_builtin(__hip_atomic_compare_exchange_strong)
  double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  bool done = false;
  while (!done && value > val) {
    done = __hip_atomic_compare_exchange_strong(addr, &value, val,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  }
  return value;
  #else
  unsigned long long *uaddr = (unsigned long long *)addr;
  unsigned long long value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
  bool done = false;
  while (!done && __longlong_as_double(value) > val) {
    done = __atomic_compare_exchange_n(uaddr, &value, __double_as_longlong(val), false,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED);
  }
  return __longlong_as_double(value);
  #endif
  #if __has_builtin(__builtin_amdgcn_is_private)
  }
  #endif
}

#endif
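
The sketch below shows how the IEEE-compliant safe variants might be called from user code. It assumes the header is pulled in through hip/hip_runtime.h; the kernel and buffer names are hypothetical and chosen for illustration only.

#include <hip/hip_runtime.h>

// Hypothetical kernel: accumulate a sum and track a running maximum with the
// IEEE-compliant safe atomics. safeAtomicAdd returns the old value of *sum,
// which this example does not use.
__global__ void example_safe_reduce(const double* in, double* sum,
                                    double* max_val, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    safeAtomicAdd(sum, in[i]);      // CAS loop or safe hardware path
    safeAtomicMax(max_val, in[i]);  // atomic update, never loses a maximum
  }
}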