HIP: Heterogeneous-computing Interface for Portability
amd_hip_atomic.h
/*
Copyright (c) 2015 - Present Advanced Micro Devices, Inc. All rights reserved.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

#pragma once

#if !defined(__HIPCC_RTC__)
#include "amd_device_functions.h"
#endif

#if __has_builtin(__hip_atomic_compare_exchange_strong)

template<bool B, typename T, typename F> struct Cond_t;

template<typename T, typename F> struct Cond_t<true, T, F> { using type = T; };
template<typename T, typename F> struct Cond_t<false, T, F> { using type = F; };

#if !__HIP_DEVICE_COMPILE__
// TODO: Remove this once the compiler pre-defines the following macros.
#define __HIP_MEMORY_SCOPE_SINGLETHREAD 1
#define __HIP_MEMORY_SCOPE_WAVEFRONT 2
#define __HIP_MEMORY_SCOPE_WORKGROUP 3
#define __HIP_MEMORY_SCOPE_AGENT 4
#define __HIP_MEMORY_SCOPE_SYSTEM 5
#endif

#if !defined(__HIPCC_RTC__)
#include "amd_hip_unsafe_atomics.h"
#endif

// Atomic expanders
template<
  int mem_order = __ATOMIC_SEQ_CST,
  int mem_scope = __HIP_MEMORY_SCOPE_SYSTEM,
  typename T,
  typename Op,
  typename F>
inline
__attribute__((always_inline, device))
T hip_cas_expander(T* p, T x, Op op, F f) noexcept
{
  using FP = __attribute__((address_space(0))) const void*;

  __device__
  extern bool is_shared_workaround(FP) asm("llvm.amdgcn.is.shared");

  if (is_shared_workaround((FP)p))
    return f();

  using U = typename Cond_t<
    sizeof(T) == sizeof(unsigned int), unsigned int, unsigned long long>::type;

  auto q = reinterpret_cast<U*>(p);

  U tmp0{__hip_atomic_load(q, mem_order, mem_scope)};
  U tmp1;
  do {
    tmp1 = tmp0;

    op(reinterpret_cast<T&>(tmp1), x);
  } while (!__hip_atomic_compare_exchange_strong(q, &tmp0, tmp1, mem_order,
                                                 mem_order, mem_scope));

  return reinterpret_cast<const T&>(tmp0);
}

template<
  int mem_order = __ATOMIC_SEQ_CST,
  int mem_scope = __HIP_MEMORY_SCOPE_SYSTEM,
  typename T,
  typename Cmp,
  typename F>
inline
__attribute__((always_inline, device))
T hip_cas_extrema_expander(T* p, T x, Cmp cmp, F f) noexcept
{
  using FP = __attribute__((address_space(0))) const void*;

  __device__
  extern bool is_shared_workaround(FP) asm("llvm.amdgcn.is.shared");

  if (is_shared_workaround((FP)p))
    return f();

  using U = typename Cond_t<
    sizeof(T) == sizeof(unsigned int), unsigned int, unsigned long long>::type;

  auto q = reinterpret_cast<U*>(p);

  U tmp{__hip_atomic_load(q, mem_order, mem_scope)};
  while (cmp(x, reinterpret_cast<const T&>(tmp)) &&
         !__hip_atomic_compare_exchange_strong(q, &tmp, x, mem_order, mem_order,
                                               mem_scope)) {}

  return reinterpret_cast<const T&>(tmp);
}
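
// Usage sketch (illustrative only, not part of this header): the expanders
// turn an arbitrary read-modify-write into a compare-exchange retry loop,
// falling back to `f` when the address may be in shared memory. The atomicMul
// helper below is a hypothetical name used for illustration, not a HIP API.
#if 0
__device__ unsigned int atomicMul(unsigned int* address, unsigned int val) {
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
      address, val, [](unsigned int& x, unsigned int y) { x *= y; }, [=]() {
        // Shared-memory fallback: no native atomic multiply exists, so this
        // path must also loop on compare-exchange.
        unsigned int old{__hip_atomic_load(address, __ATOMIC_RELAXED,
                                           __HIP_MEMORY_SCOPE_WORKGROUP)};
        unsigned int assumed;
        do {
          assumed = old;  // on failure, `old` is refreshed by the builtin
        } while (!__hip_atomic_compare_exchange_strong(
                     address, &old, assumed * val, __ATOMIC_RELAXED,
                     __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP));
        return old;  // value observed just before the successful exchange
      });
}
#endif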

__device__
inline
int atomicCAS(int* address, int compare, int val) {
  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
                                       __HIP_MEMORY_SCOPE_AGENT);
  return compare;
}

__device__
inline
int atomicCAS_system(int* address, int compare, int val) {
  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
                                       __HIP_MEMORY_SCOPE_SYSTEM);
  return compare;
}

__device__
inline
unsigned int atomicCAS(unsigned int* address, unsigned int compare, unsigned int val) {
  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
                                       __HIP_MEMORY_SCOPE_AGENT);
  return compare;
}

__device__
inline
unsigned int atomicCAS_system(unsigned int* address, unsigned int compare, unsigned int val) {
  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
                                       __HIP_MEMORY_SCOPE_SYSTEM);
  return compare;
}

__device__
inline
unsigned long atomicCAS(unsigned long* address, unsigned long compare, unsigned long val) {
  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
                                       __HIP_MEMORY_SCOPE_AGENT);
  return compare;
}

__device__
inline
unsigned long atomicCAS_system(unsigned long* address, unsigned long compare, unsigned long val) {
  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
                                       __HIP_MEMORY_SCOPE_SYSTEM);
  return compare;
}

__device__
inline
unsigned long long atomicCAS(unsigned long long* address, unsigned long long compare,
                             unsigned long long val) {
  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
                                       __HIP_MEMORY_SCOPE_AGENT);
  return compare;
}

__device__
inline
unsigned long long atomicCAS_system(unsigned long long* address, unsigned long long compare,
                                    unsigned long long val) {
  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
                                       __HIP_MEMORY_SCOPE_SYSTEM);
  return compare;
}

__device__
inline
float atomicCAS(float* address, float compare, float val) {
  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
                                       __HIP_MEMORY_SCOPE_AGENT);
  return compare;
}

__device__
inline
float atomicCAS_system(float* address, float compare, float val) {
  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
                                       __HIP_MEMORY_SCOPE_SYSTEM);
  return compare;
}

__device__
inline
double atomicCAS(double* address, double compare, double val) {
  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
                                       __HIP_MEMORY_SCOPE_AGENT);
  return compare;
}

__device__
inline
double atomicCAS_system(double* address, double compare, double val) {
  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
                                       __HIP_MEMORY_SCOPE_SYSTEM);
  return compare;
}
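
// Usage sketch (illustrative only, not part of this header): atomicCAS returns
// the value it observed at `address`, so the usual retry pattern keeps
// exchanging until the observed value matches the one the update was computed
// from. The helper below (atomicScale, a hypothetical name) builds a
// read-modify-write HIP does not provide; it compares bit patterns to
// sidestep NaN and -0.0f comparison pitfalls.
#if 0
__device__ float atomicScale(float* p, float s) {
  float observed = *p, assumed;
  do {
    assumed = observed;
    observed = atomicCAS(p, assumed, assumed * s);  // yields the prior value
  } while (__float_as_uint(observed) != __float_as_uint(assumed));
  return observed;  // value immediately before the successful exchange
}
#endif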

__device__
inline
int atomicAdd(int* address, int val) {
  return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}

__device__
inline
int atomicAdd_system(int* address, int val) {
  return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
}

__device__
inline
unsigned int atomicAdd(unsigned int* address, unsigned int val) {
  return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}

__device__
inline
unsigned int atomicAdd_system(unsigned int* address, unsigned int val) {
  return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
}

__device__
inline
unsigned long atomicAdd(unsigned long* address, unsigned long val) {
  return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}

__device__
inline
unsigned long atomicAdd_system(unsigned long* address, unsigned long val) {
  return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
}

__device__
inline
unsigned long long atomicAdd(unsigned long long* address, unsigned long long val) {
  return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}

__device__
inline
unsigned long long atomicAdd_system(unsigned long long* address, unsigned long long val) {
  return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
}

__device__
inline
float atomicAdd(float* address, float val) {
#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
  return unsafeAtomicAdd(address, val);
#else
  return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif
}

__device__
inline
float atomicAdd_system(float* address, float val) {
  return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
}

#if !defined(__HIPCC_RTC__)
DEPRECATED("use atomicAdd instead")
#endif // !defined(__HIPCC_RTC__)
__device__
inline
void atomicAddNoRet(float* address, float val)
{
  __ockl_atomic_add_noret_f32(address, val);
}

__device__
inline
double atomicAdd(double* address, double val) {
#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
  return unsafeAtomicAdd(address, val);
#else
  return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif
}

__device__
inline
double atomicAdd_system(double* address, double val) {
  return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
}
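
// Usage sketch (illustrative only, not part of this header): atomicAdd is the
// usual tool for data-dependent accumulation, as in the histogram below. When
// unsafe floating-point atomics are enabled (the compiler defines
// __AMDGCN_UNSAFE_FP_ATOMICS__, e.g. under -munsafe-fp-atomics), the float and
// double overloads above route through unsafeAtomicAdd instead.
#if 0
__global__ void histogram(const unsigned int* in, unsigned int n,
                          unsigned int* bins, unsigned int num_bins) {
  unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) atomicAdd(&bins[in[i] % num_bins], 1u);  // contended increment
}
#endif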

__device__
inline
int atomicSub(int* address, int val) {
  return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}

__device__
inline
int atomicSub_system(int* address, int val) {
  return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
}

__device__
inline
unsigned int atomicSub(unsigned int* address, unsigned int val) {
  return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}

__device__
inline
unsigned int atomicSub_system(unsigned int* address, unsigned int val) {
  return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
}

__device__
inline
unsigned long atomicSub(unsigned long* address, unsigned long val) {
  return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}

__device__
inline
unsigned long atomicSub_system(unsigned long* address, unsigned long val) {
  return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
}

__device__
inline
unsigned long long atomicSub(unsigned long long* address, unsigned long long val) {
  return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}

__device__
inline
unsigned long long atomicSub_system(unsigned long long* address, unsigned long long val) {
  return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
}

__device__
inline
float atomicSub(float* address, float val) {
#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
  return unsafeAtomicAdd(address, -val);
#else
  return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif
}

__device__
inline
float atomicSub_system(float* address, float val) {
  return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
}

__device__
inline
double atomicSub(double* address, double val) {
#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
  return unsafeAtomicAdd(address, -val);
#else
  return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif
}

__device__
inline
double atomicSub_system(double* address, double val) {
  return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
}

__device__
inline
int atomicExch(int* address, int val) {
  return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}

__device__
inline
int atomicExch_system(int* address, int val) {
  return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
}

__device__
inline
unsigned int atomicExch(unsigned int* address, unsigned int val) {
  return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}

__device__
inline
unsigned int atomicExch_system(unsigned int* address, unsigned int val) {
  return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
}

__device__
inline
unsigned long atomicExch(unsigned long* address, unsigned long val) {
  return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}

__device__
inline
unsigned long atomicExch_system(unsigned long* address, unsigned long val) {
  return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
}

__device__
inline
unsigned long long atomicExch(unsigned long long* address, unsigned long long val) {
  return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}

__device__
inline
unsigned long long atomicExch_system(unsigned long long* address, unsigned long long val) {
  return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
}

__device__
inline
float atomicExch(float* address, float val) {
  return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}

__device__
inline
float atomicExch_system(float* address, float val) {
  return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
}

__device__
inline
double atomicExch(double* address, double val) {
  return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}

__device__
inline
double atomicExch_system(double* address, double val) {
  return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
}
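
// Usage sketch (illustrative only, not part of this header): atomicExch is
// enough for a rudimentary device-side spinlock. The relaxed ordering used
// throughout this header means explicit fences are needed around the critical
// section, and divergent threads of one wavefront contending for the same lock
// can deadlock, so this is a sketch rather than a recommended primitive.
#if 0
__device__ void lock(int* mutex) {
  while (atomicExch(mutex, 1) != 0) {}  // spin until we swap a 1 in over a 0
  __threadfence();                      // acquire-like fence
}
__device__ void unlock(int* mutex) {
  __threadfence();                      // release-like fence
  atomicExch(mutex, 0);
}
#endif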

__device__
inline
int atomicMin(int* address, int val) {
#if defined(__gfx941__)
  return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
      address, val, [](int x, int y) { return x < y; }, [=]() {
        return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED,
                                      __HIP_MEMORY_SCOPE_AGENT);
      });
#else
  return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}

__device__
inline
int atomicMin_system(int* address, int val) {
#if defined(__gfx941__)
  return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
      address, val, [](int x, int y) { return x < y; }, [=]() {
        return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED,
                                      __HIP_MEMORY_SCOPE_SYSTEM);
      });
#else
  return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}

__device__
inline
unsigned int atomicMin(unsigned int* address, unsigned int val) {
#if defined(__gfx941__)
  return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
      address, val, [](unsigned int x, unsigned int y) { return x < y; }, [=]() {
        return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED,
                                      __HIP_MEMORY_SCOPE_AGENT);
      });
#else
  return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}

__device__
inline
unsigned int atomicMin_system(unsigned int* address, unsigned int val) {
#if defined(__gfx941__)
  return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
      address, val, [](unsigned int x, unsigned int y) { return x < y; }, [=]() {
        return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED,
                                      __HIP_MEMORY_SCOPE_SYSTEM);
      });
#else
  return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}

__device__
inline
unsigned long atomicMin(unsigned long* address, unsigned long val) {
#if defined(__gfx941__)
  return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
      address,
      val,
      [](unsigned long x, unsigned long y) { return x < y; },
      [=]() {
        return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED,
                                      __HIP_MEMORY_SCOPE_AGENT);
      });
#else
  return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}

__device__
inline
unsigned long atomicMin_system(unsigned long* address, unsigned long val) {
#if defined(__gfx941__)
  return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
      address,
      val,
      [](unsigned long x, unsigned long y) { return x < y; },
      [=]() {
        return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED,
                                      __HIP_MEMORY_SCOPE_SYSTEM);
      });
#else
  return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}

__device__
inline
unsigned long long atomicMin(unsigned long long* address, unsigned long long val) {
#if defined(__gfx941__)
  return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
      address,
      val,
      [](unsigned long long x, unsigned long long y) { return x < y; },
      [=]() {
        return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED,
                                      __HIP_MEMORY_SCOPE_AGENT);
      });
#else
  return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}

__device__
inline
unsigned long long atomicMin_system(unsigned long long* address, unsigned long long val) {
#if defined(__gfx941__)
  return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
      address,
      val,
      [](unsigned long long x, unsigned long long y) { return x < y; },
      [=]() {
        return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED,
                                      __HIP_MEMORY_SCOPE_SYSTEM);
      });
#else
  return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}

__device__
inline
long long atomicMin(long long* address, long long val) {
#if defined(__gfx941__)
  return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
      address, val, [](long long x, long long y) { return x < y; },
      [=]() {
        return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
      });
#else
  return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}

__device__
inline
long long atomicMin_system(long long* address, long long val) {
#if defined(__gfx941__)
  return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
      address, val, [](long long x, long long y) { return x < y; },
      [=]() {
        return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
      });
#else
  return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}

__device__
inline
float atomicMin(float* addr, float val) {
#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
  return unsafeAtomicMin(addr, val);
#else
  typedef union u_hold {
    float a;
    unsigned int b;
  } u_hold_t;
  u_hold_t u{val};
  bool neg_zero = 0x80000000U == u.b;
  #if __has_builtin(__hip_atomic_load) && \
      __has_builtin(__hip_atomic_compare_exchange_strong)
  float value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  bool done = false;
  while (!done && (value > val || (neg_zero && value == 0.0f))) {
    done = __hip_atomic_compare_exchange_strong(addr, &value, val,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  }
  return value;
  #else
  unsigned int *uaddr = (unsigned int *)addr;
  unsigned int value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
  bool done = false;
  while (!done && (__uint_as_float(value) > val || (neg_zero && __uint_as_float(value) == 0.0f))) {
    done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED);
  }
  return __uint_as_float(value);
  #endif
#endif
}
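
// Note (illustrative): the neg_zero test above exists because IEEE comparison
// treats -0.0f and +0.0f as equal, so `value > val` alone would never replace
// a stored +0.0f with -0.0f. For example, after
//   *p = 0.0f; atomicMin(p, -0.0f);
// the loop still performs the exchange and *p ends up holding -0.0f.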

__device__
inline
float atomicMin_system(float* address, float val) {
  unsigned int* uaddr { reinterpret_cast<unsigned int*>(address) };
  #if __has_builtin(__hip_atomic_load)
  unsigned int tmp {__hip_atomic_load(uaddr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM)};
  #else
  unsigned int tmp {__atomic_load_n(uaddr, __ATOMIC_RELAXED)};
  #endif
  float value = __uint_as_float(tmp);

  while (val < value) {
    value = atomicCAS_system(address, value, val);
  }

  return value;
}

__device__
inline
double atomicMin(double* addr, double val) {
#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
  return unsafeAtomicMin(addr, val);
#else
  typedef union u_hold {
    double a;
    unsigned long long b;
  } u_hold_t;
  u_hold_t u{val};
  bool neg_zero = 0x8000000000000000ULL == u.b;
  #if __has_builtin(__hip_atomic_load) && \
      __has_builtin(__hip_atomic_compare_exchange_strong)
  double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  bool done = false;
  while (!done && (value > val || (neg_zero && value == 0.0))) {
    done = __hip_atomic_compare_exchange_strong(addr, &value, val,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  }
  return value;
  #else
  unsigned long long *uaddr = (unsigned long long *)addr;
  unsigned long long value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
  bool done = false;
  while (!done &&
         (__longlong_as_double(value) > val || (neg_zero && __longlong_as_double(value) == 0.0))) {
    done = __atomic_compare_exchange_n(uaddr, &value, __double_as_longlong(val), false,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED);
  }
  return __longlong_as_double(value);
  #endif
#endif
}

__device__
inline
double atomicMin_system(double* address, double val) {
  unsigned long long* uaddr { reinterpret_cast<unsigned long long*>(address) };
  #if __has_builtin(__hip_atomic_load)
  unsigned long long tmp {__hip_atomic_load(uaddr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM)};
  #else
  unsigned long long tmp {__atomic_load_n(uaddr, __ATOMIC_RELAXED)};
  #endif
  double value = __longlong_as_double(tmp);

  while (val < value) {
    value = atomicCAS_system(address, value, val);
  }

  return value;
}

__device__
inline
int atomicMax(int* address, int val) {
#if defined(__gfx941__)
  return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
      address, val, [](int x, int y) { return y < x; }, [=]() {
        return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED,
                                      __HIP_MEMORY_SCOPE_AGENT);
      });
#else
  return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}

__device__
inline
int atomicMax_system(int* address, int val) {
#if defined(__gfx941__)
  return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
      address, val, [](int x, int y) { return y < x; }, [=]() {
        return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED,
                                      __HIP_MEMORY_SCOPE_SYSTEM);
      });
#else
  return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}

__device__
inline
unsigned int atomicMax(unsigned int* address, unsigned int val) {
#if defined(__gfx941__)
  return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
      address, val, [](unsigned int x, unsigned int y) { return y < x; }, [=]() {
        return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED,
                                      __HIP_MEMORY_SCOPE_AGENT);
      });
#else
  return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}

__device__
inline
unsigned int atomicMax_system(unsigned int* address, unsigned int val) {
#if defined(__gfx941__)
  return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
      address, val, [](unsigned int x, unsigned int y) { return y < x; }, [=]() {
        return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED,
                                      __HIP_MEMORY_SCOPE_SYSTEM);
      });
#else
  return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}

__device__
inline
unsigned long atomicMax(unsigned long* address, unsigned long val) {
#if defined(__gfx941__)
  return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
      address,
      val,
      [](unsigned long x, unsigned long y) { return y < x; },
      [=]() {
        return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED,
                                      __HIP_MEMORY_SCOPE_AGENT);
      });
#else
  return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}

__device__
inline
unsigned long atomicMax_system(unsigned long* address, unsigned long val) {
#if defined(__gfx941__)
  return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
      address,
      val,
      [](unsigned long x, unsigned long y) { return y < x; },
      [=]() {
        return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED,
                                      __HIP_MEMORY_SCOPE_SYSTEM);
      });
#else
  return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}

__device__
inline
unsigned long long atomicMax(unsigned long long* address, unsigned long long val) {
#if defined(__gfx941__)
  return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
      address,
      val,
      [](unsigned long long x, unsigned long long y) { return y < x; },
      [=]() {
        return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED,
                                      __HIP_MEMORY_SCOPE_AGENT);
      });
#else
  return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}

__device__
inline
unsigned long long atomicMax_system(unsigned long long* address, unsigned long long val) {
#if defined(__gfx941__)
  return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
      address,
      val,
      [](unsigned long long x, unsigned long long y) { return y < x; },
      [=]() {
        return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED,
                                      __HIP_MEMORY_SCOPE_SYSTEM);
      });
#else
  return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}

__device__
inline
long long atomicMax(long long* address, long long val) {
#if defined(__gfx941__)
  return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
      address, val, [](long long x, long long y) { return y < x; },
      [=]() {
        return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
      });
#else
  return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}

__device__
inline
long long atomicMax_system(long long* address, long long val) {
#if defined(__gfx941__)
  return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
      address, val, [](long long x, long long y) { return y < x; },
      [=]() {
        return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
      });
#else
  return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}

__device__
inline
float atomicMax(float* addr, float val) {
#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
  return unsafeAtomicMax(addr, val);
#else
  typedef union u_hold {
    float a;
    unsigned int b;
  } u_hold_t;
  u_hold_t u{val};
  bool neg_zero = 0x80000000U == u.b;
  #if __has_builtin(__hip_atomic_load) && \
      __has_builtin(__hip_atomic_compare_exchange_strong)
  float value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  bool done = false;
  while (!done && (value < val || (neg_zero && value == 0.0f))) {
    done = __hip_atomic_compare_exchange_strong(addr, &value, val,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  }
  return value;
  #else
  unsigned int *uaddr = (unsigned int *)addr;
  unsigned int value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
  bool done = false;
  while (!done && (__uint_as_float(value) < val || (neg_zero && __uint_as_float(value) == 0.0f))) {
    done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED);
  }
  return __uint_as_float(value);
  #endif
#endif
}

__device__
inline
float atomicMax_system(float* address, float val) {
  unsigned int* uaddr { reinterpret_cast<unsigned int*>(address) };
  #if __has_builtin(__hip_atomic_load)
  unsigned int tmp {__hip_atomic_load(uaddr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM)};
  #else
  unsigned int tmp {__atomic_load_n(uaddr, __ATOMIC_RELAXED)};
  #endif
  float value = __uint_as_float(tmp);

  while (value < val) {
    value = atomicCAS_system(address, value, val);
  }

  return value;
}

__device__
inline
double atomicMax(double* addr, double val) {
#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
  return unsafeAtomicMax(addr, val);
#else
  typedef union u_hold {
    double a;
    unsigned long long b;
  } u_hold_t;
  u_hold_t u{val};
  bool neg_zero = 0x8000000000000000ULL == u.b;
  #if __has_builtin(__hip_atomic_load) && \
      __has_builtin(__hip_atomic_compare_exchange_strong)
  double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  bool done = false;
  while (!done && (value < val || (neg_zero && value == 0.0))) {
    done = __hip_atomic_compare_exchange_strong(addr, &value, val,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  }
  return value;
  #else
  unsigned long long *uaddr = (unsigned long long *)addr;
  unsigned long long value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
  bool done = false;
  while (!done &&
         (__longlong_as_double(value) < val || (neg_zero && __longlong_as_double(value) == 0.0))) {
    done = __atomic_compare_exchange_n(uaddr, &value, __double_as_longlong(val), false,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED);
  }
  return __longlong_as_double(value);
  #endif
#endif
}

__device__
inline
double atomicMax_system(double* address, double val) {
  unsigned long long* uaddr { reinterpret_cast<unsigned long long*>(address) };
  #if __has_builtin(__hip_atomic_load)
  unsigned long long tmp {__hip_atomic_load(uaddr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM)};
  #else
  unsigned long long tmp {__atomic_load_n(uaddr, __ATOMIC_RELAXED)};
  #endif
  double value = __longlong_as_double(tmp);

  while (value < val) {
    value = atomicCAS_system(address, value, val);
  }

  return value;
}

__device__
inline
unsigned int atomicInc(unsigned int* address, unsigned int val)
{
#if defined(__gfx941__)
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
      address,
      val,
      [](unsigned int& x, unsigned int y) { x = (x >= y) ? 0 : (x + 1); },
      [=]() {
        return
            __builtin_amdgcn_atomic_inc32(address, val, __ATOMIC_RELAXED, "agent");
      });
#else
  return __builtin_amdgcn_atomic_inc32(address, val, __ATOMIC_RELAXED, "agent");
#endif // __gfx941__
}

__device__
inline
unsigned int atomicDec(unsigned int* address, unsigned int val)
{
#if defined(__gfx941__)
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
      address,
      val,
      [](unsigned int& x, unsigned int y) { x = (!x || x > y) ? y : (x - 1); },
      [=]() {
        return
            __builtin_amdgcn_atomic_dec32(address, val, __ATOMIC_RELAXED, "agent");
      });
#else
  return __builtin_amdgcn_atomic_dec32(address, val, __ATOMIC_RELAXED, "agent");
#endif // __gfx941__
}
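
// Usage sketch (illustrative only, not part of this header): atomicInc and
// atomicDec wrap at a caller-supplied bound, which makes them natural
// ring-buffer cursors. With val == N - 1 the returned index cycles through
// 0, 1, ..., N - 1, 0, ...
#if 0
__device__ unsigned int ring_slot(unsigned int* cursor, unsigned int N) {
  return atomicInc(cursor, N - 1u);  // returns old value; wraps to 0 at N - 1
}
#endif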

__device__
inline
int atomicAnd(int* address, int val) {
#if defined(__gfx941__)
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
      address, val, [](int& x, int y) { x &= y; }, [=]() {
        return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED,
                                      __HIP_MEMORY_SCOPE_AGENT);
      });
#else
  return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}

__device__
inline
int atomicAnd_system(int* address, int val) {
#if defined(__gfx941__)
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
      address, val, [](int& x, int y) { x &= y; }, [=]() {
        return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED,
                                      __HIP_MEMORY_SCOPE_SYSTEM);
      });
#else
  return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}

__device__
inline
unsigned int atomicAnd(unsigned int* address, unsigned int val) {
#if defined(__gfx941__)
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
      address, val, [](unsigned int& x, unsigned int y) { x &= y; }, [=]() {
        return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED,
                                      __HIP_MEMORY_SCOPE_AGENT);
      });
#else
  return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}

__device__
inline
unsigned int atomicAnd_system(unsigned int* address, unsigned int val) {
#if defined(__gfx941__)
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
      address, val, [](unsigned int& x, unsigned int y) { x &= y; }, [=]() {
        return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED,
                                      __HIP_MEMORY_SCOPE_SYSTEM);
      });
#else
  return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}

__device__
inline
unsigned long atomicAnd(unsigned long* address, unsigned long val) {
#if defined(__gfx941__)
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
      address, val, [](unsigned long& x, unsigned long y) { x &= y; }, [=]() {
        return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED,
                                      __HIP_MEMORY_SCOPE_AGENT);
      });
#else
  return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}

__device__
inline
unsigned long atomicAnd_system(unsigned long* address, unsigned long val) {
#if defined(__gfx941__)
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
      address, val, [](unsigned long& x, unsigned long y) { x &= y; }, [=]() {
        return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED,
                                      __HIP_MEMORY_SCOPE_SYSTEM);
      });
#else
  return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}

__device__
inline
unsigned long long atomicAnd(unsigned long long* address, unsigned long long val) {
#if defined(__gfx941__)
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
      address,
      val,
      [](unsigned long long& x, unsigned long long y) { x &= y; },
      [=]() {
        return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED,
                                      __HIP_MEMORY_SCOPE_AGENT);
      });
#else
  return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}

__device__
inline
unsigned long long atomicAnd_system(unsigned long long* address, unsigned long long val) {
#if defined(__gfx941__)
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
      address,
      val,
      [](unsigned long long& x, unsigned long long y) { x &= y; },
      [=]() {
        return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED,
                                      __HIP_MEMORY_SCOPE_SYSTEM);
      });
#else
  return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}

__device__
inline
int atomicOr(int* address, int val) {
#if defined(__gfx941__)
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
      address, val, [](int& x, int y) { x |= y; }, [=]() {
        return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED,
                                     __HIP_MEMORY_SCOPE_AGENT);
      });
#else
  return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}

__device__
inline
int atomicOr_system(int* address, int val) {
#if defined(__gfx941__)
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
      address, val, [](int& x, int y) { x |= y; }, [=]() {
        return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED,
                                     __HIP_MEMORY_SCOPE_SYSTEM);
      });
#else
  return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}

__device__
inline
unsigned int atomicOr(unsigned int* address, unsigned int val) {
#if defined(__gfx941__)
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
      address, val, [](unsigned int& x, unsigned int y) { x |= y; }, [=]() {
        return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED,
                                     __HIP_MEMORY_SCOPE_AGENT);
      });
#else
  return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}

__device__
inline
unsigned int atomicOr_system(unsigned int* address, unsigned int val) {
#if defined(__gfx941__)
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
      address, val, [](unsigned int& x, unsigned int y) { x |= y; }, [=]() {
        return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED,
                                     __HIP_MEMORY_SCOPE_SYSTEM);
      });
#else
  return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}

__device__
inline
unsigned long atomicOr(unsigned long* address, unsigned long val) {
#if defined(__gfx941__)
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
      address, val, [](unsigned long& x, unsigned long y) { x |= y; }, [=]() {
        return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED,
                                     __HIP_MEMORY_SCOPE_AGENT);
      });
#else
  return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}

__device__
inline
unsigned long atomicOr_system(unsigned long* address, unsigned long val) {
#if defined(__gfx941__)
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
      address, val, [](unsigned long& x, unsigned long y) { x |= y; }, [=]() {
        return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED,
                                     __HIP_MEMORY_SCOPE_SYSTEM);
      });
#else
  return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}

__device__
inline
unsigned long long atomicOr(unsigned long long* address, unsigned long long val) {
#if defined(__gfx941__)
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
      address,
      val,
      [](unsigned long long& x, unsigned long long y) { x |= y; },
      [=]() {
        return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED,
                                     __HIP_MEMORY_SCOPE_AGENT);
      });
#else
  return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}

__device__
inline
unsigned long long atomicOr_system(unsigned long long* address, unsigned long long val) {
#if defined(__gfx941__)
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
      address,
      val,
      [](unsigned long long& x, unsigned long long y) { x |= y; },
      [=]() {
        return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED,
                                     __HIP_MEMORY_SCOPE_SYSTEM);
      });
#else
  return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}

__device__
inline
int atomicXor(int* address, int val) {
#if defined(__gfx941__)
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
      address, val, [](int& x, int y) { x ^= y; }, [=]() {
        return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED,
                                      __HIP_MEMORY_SCOPE_AGENT);
      });
#else
  return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}

__device__
inline
int atomicXor_system(int* address, int val) {
#if defined(__gfx941__)
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
      address, val, [](int& x, int y) { x ^= y; }, [=]() {
        return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED,
                                      __HIP_MEMORY_SCOPE_SYSTEM);
      });
#else
  return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}

__device__
inline
unsigned int atomicXor(unsigned int* address, unsigned int val) {
#if defined(__gfx941__)
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
      address, val, [](unsigned int& x, unsigned int y) { x ^= y; }, [=]() {
        return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED,
                                      __HIP_MEMORY_SCOPE_AGENT);
      });
#else
  return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}

__device__
inline
unsigned int atomicXor_system(unsigned int* address, unsigned int val) {
#if defined(__gfx941__)
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
      address, val, [](unsigned int& x, unsigned int y) { x ^= y; }, [=]() {
        return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED,
                                      __HIP_MEMORY_SCOPE_SYSTEM);
      });
#else
  return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}

__device__
inline
unsigned long atomicXor(unsigned long* address, unsigned long val) {
#if defined(__gfx941__)
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
      address, val, [](unsigned long& x, unsigned long y) { x ^= y; }, [=]() {
        return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED,
                                      __HIP_MEMORY_SCOPE_AGENT);
      });
#else
  return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}

__device__
inline
unsigned long atomicXor_system(unsigned long* address, unsigned long val) {
#if defined(__gfx941__)
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
      address, val, [](unsigned long& x, unsigned long y) { x ^= y; }, [=]() {
        return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED,
                                      __HIP_MEMORY_SCOPE_SYSTEM);
      });
#else
  return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}

__device__
inline
unsigned long long atomicXor(unsigned long long* address, unsigned long long val) {
#if defined(__gfx941__)
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
      address,
      val,
      [](unsigned long long& x, unsigned long long y) { x ^= y; },
      [=]() {
        return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED,
                                      __HIP_MEMORY_SCOPE_AGENT);
      });
#else
  return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}

__device__
inline
unsigned long long atomicXor_system(unsigned long long* address, unsigned long long val) {
  return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
}

#else // __hip_atomic_compare_exchange_strong

__device__
inline
int atomicCAS(int* address, int compare, int val)
{
  __atomic_compare_exchange_n(
      address, &compare, val, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED);

  return compare;
}
__device__
inline
unsigned int atomicCAS(
    unsigned int* address, unsigned int compare, unsigned int val)
{
  __atomic_compare_exchange_n(
      address, &compare, val, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED);

  return compare;
}
__device__
inline
unsigned long long atomicCAS(
    unsigned long long* address,
    unsigned long long compare,
    unsigned long long val)
{
  __atomic_compare_exchange_n(
      address, &compare, val, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED);

  return compare;
}
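
// Note (illustrative): __atomic_compare_exchange_n writes the observed value
// back into `compare` when the exchange fails and leaves it untouched on
// success, so returning `compare` preserves the CUDA-style contract that
// atomicCAS always reports the value the location held before the operation.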

__device__
inline
int atomicAdd(int* address, int val)
{
  return __atomic_fetch_add(address, val, __ATOMIC_RELAXED);
}
__device__
inline
unsigned int atomicAdd(unsigned int* address, unsigned int val)
{
  return __atomic_fetch_add(address, val, __ATOMIC_RELAXED);
}
__device__
inline
unsigned long long atomicAdd(
    unsigned long long* address, unsigned long long val)
{
  return __atomic_fetch_add(address, val, __ATOMIC_RELAXED);
}
__device__
inline
float atomicAdd(float* address, float val)
{
#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
  return unsafeAtomicAdd(address, val);
#else
  return __atomic_fetch_add(address, val, __ATOMIC_RELAXED);
#endif
}

#if !defined(__HIPCC_RTC__)
DEPRECATED("use atomicAdd instead")
#endif // !defined(__HIPCC_RTC__)
__device__
inline
void atomicAddNoRet(float* address, float val)
{
  __ockl_atomic_add_noret_f32(address, val);
}

__device__
inline
double atomicAdd(double* address, double val)
{
#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
  return unsafeAtomicAdd(address, val);
#else
  return __atomic_fetch_add(address, val, __ATOMIC_RELAXED);
#endif
}

__device__
inline
int atomicSub(int* address, int val)
{
  return __atomic_fetch_sub(address, val, __ATOMIC_RELAXED);
}
__device__
inline
unsigned int atomicSub(unsigned int* address, unsigned int val)
{
  return __atomic_fetch_sub(address, val, __ATOMIC_RELAXED);
}

__device__
inline
int atomicExch(int* address, int val)
{
  return __atomic_exchange_n(address, val, __ATOMIC_RELAXED);
}
__device__
inline
unsigned int atomicExch(unsigned int* address, unsigned int val)
{
  return __atomic_exchange_n(address, val, __ATOMIC_RELAXED);
}
__device__
inline
unsigned long long atomicExch(unsigned long long* address, unsigned long long val)
{
  return __atomic_exchange_n(address, val, __ATOMIC_RELAXED);
}
__device__
inline
float atomicExch(float* address, float val)
{
  return __uint_as_float(__atomic_exchange_n(
      reinterpret_cast<unsigned int*>(address),
      __float_as_uint(val),
      __ATOMIC_RELAXED));
}

__device__
inline
int atomicMin(int* address, int val)
{
  return __atomic_fetch_min(address, val, __ATOMIC_RELAXED);
}
__device__
inline
unsigned int atomicMin(unsigned int* address, unsigned int val)
{
  return __atomic_fetch_min(address, val, __ATOMIC_RELAXED);
}
__device__
inline
unsigned long long atomicMin(
    unsigned long long* address, unsigned long long val)
{
  unsigned long long tmp{__atomic_load_n(address, __ATOMIC_RELAXED)};
  while (val < tmp) {
    const auto tmp1 = __atomic_load_n(address, __ATOMIC_RELAXED);

    if (tmp1 != tmp) { tmp = tmp1; continue; }

    tmp = atomicCAS(address, tmp, val);
  }

  return tmp;
}
__device__ inline long long atomicMin(long long* address, long long val) {
  long long tmp{__atomic_load_n(address, __ATOMIC_RELAXED)};
  while (val < tmp) {
    const auto tmp1 = __atomic_load_n(address, __ATOMIC_RELAXED);

    if (tmp1 != tmp) {
      tmp = tmp1;
      continue;
    }

    // atomicCAS has no long long overload in this fallback path, so reuse the
    // unsigned long long one; the two's-complement round-trip preserves the
    // stored bit pattern.
    tmp = (long long)atomicCAS(
        (unsigned long long*)address, (unsigned long long)tmp,
        (unsigned long long)val);
  }
  return tmp;
}
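
// Note (illustrative): the extrema emulations above reload the location before
// each attempt so that a competing writer advances `tmp` without burning a
// failed compare-exchange, and they only attempt the exchange while `val`
// would still tighten the bound.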

__device__
inline
int atomicMax(int* address, int val)
{
  return __atomic_fetch_max(address, val, __ATOMIC_RELAXED);
}
__device__
inline
unsigned int atomicMax(unsigned int* address, unsigned int val)
{
  return __atomic_fetch_max(address, val, __ATOMIC_RELAXED);
}
__device__
inline
unsigned long long atomicMax(
    unsigned long long* address, unsigned long long val)
{
  unsigned long long tmp{__atomic_load_n(address, __ATOMIC_RELAXED)};
  while (tmp < val) {
    const auto tmp1 = __atomic_load_n(address, __ATOMIC_RELAXED);

    if (tmp1 != tmp) { tmp = tmp1; continue; }

    tmp = atomicCAS(address, tmp, val);
  }

  return tmp;
}
__device__ inline long long atomicMax(long long* address, long long val) {
  long long tmp{__atomic_load_n(address, __ATOMIC_RELAXED)};
  while (tmp < val) {
    const auto tmp1 = __atomic_load_n(address, __ATOMIC_RELAXED);

    if (tmp1 != tmp) {
      tmp = tmp1;
      continue;
    }

    // As in atomicMin above, route through the unsigned long long atomicCAS.
    tmp = (long long)atomicCAS(
        (unsigned long long*)address, (unsigned long long)tmp,
        (unsigned long long)val);
  }
  return tmp;
}

__device__
inline
unsigned int atomicInc(unsigned int* address, unsigned int val)
{
  return __builtin_amdgcn_atomic_inc32(address, val, __ATOMIC_RELAXED, "agent");
}

__device__
inline
unsigned int atomicDec(unsigned int* address, unsigned int val)
{
  return __builtin_amdgcn_atomic_dec32(address, val, __ATOMIC_RELAXED, "agent");
}

__device__
inline
int atomicAnd(int* address, int val)
{
  return __atomic_fetch_and(address, val, __ATOMIC_RELAXED);
}
__device__
inline
unsigned int atomicAnd(unsigned int* address, unsigned int val)
{
  return __atomic_fetch_and(address, val, __ATOMIC_RELAXED);
}
__device__
inline
unsigned long long atomicAnd(
    unsigned long long* address, unsigned long long val)
{
  return __atomic_fetch_and(address, val, __ATOMIC_RELAXED);
}

__device__
inline
int atomicOr(int* address, int val)
{
  return __atomic_fetch_or(address, val, __ATOMIC_RELAXED);
}
__device__
inline
unsigned int atomicOr(unsigned int* address, unsigned int val)
{
  return __atomic_fetch_or(address, val, __ATOMIC_RELAXED);
}
__device__
inline
unsigned long long atomicOr(
    unsigned long long* address, unsigned long long val)
{
  return __atomic_fetch_or(address, val, __ATOMIC_RELAXED);
}

__device__
inline
int atomicXor(int* address, int val)
{
  return __atomic_fetch_xor(address, val, __ATOMIC_RELAXED);
}
__device__
inline
unsigned int atomicXor(unsigned int* address, unsigned int val)
{
  return __atomic_fetch_xor(address, val, __ATOMIC_RELAXED);
}
__device__
inline
unsigned long long atomicXor(
    unsigned long long* address, unsigned long long val)
{
  return __atomic_fetch_xor(address, val, __ATOMIC_RELAXED);
}

#endif // __hip_atomic_compare_exchange_strong