HIP: Heterogeneous-computing Interface for Portability
amd_hip_runtime.h
1 /*
2 Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
3 
4 Permission is hereby granted, free of charge, to any person obtaining a copy
5 of this software and associated documentation files (the "Software"), to deal
6 in the Software without restriction, including without limitation the rights
7 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 copies of the Software, and to permit persons to whom the Software is
9 furnished to do so, subject to the following conditions:
10 
11 The above copyright notice and this permission notice shall be included in
12 all copies or substantial portions of the Software.
13 
14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20 THE SOFTWARE.
21 */
22 
28 //#pragma once
29 #ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_RUNTIME_H
30 #define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_RUNTIME_H
31 
32 #include <hip/amd_detail/amd_hip_common.h>
33 
34 #if !defined(__HIPCC_RTC__)
35 #ifdef __cplusplus
36 #include <cstddef>
37 #else
38 #include <stddef.h>
39 #endif // __cplusplus
40 #endif // !defined(__HIPCC_RTC__)
41 
#ifdef __cplusplus
extern "C" {
#endif

// Debug-agent (dbgapi) build-identification hooks exported with C linkage.
// Presumably these report build metadata of the HIP runtime — TODO confirm
// against the implementation.

// Returns the build name string.
const char* amd_dbgapi_get_build_name();

// Returns the git hash string of the build.
const char* amd_dbgapi_get_git_hash();

// Returns a numeric build identifier.
size_t amd_dbgapi_get_build_id();

#ifdef __cplusplus
} /* extern "C" */
#endif
77 
78 //---
79 // Top part of file can be compiled with any compiler
80 
81 #if !defined(__HIPCC_RTC__)
82 #ifdef __cplusplus
83 #include <cmath>
84 #include <cstdint>
85 #include <tuple>
86 #else
87 #include <math.h>
88 #include <stdint.h>
89 #endif // __cplusplus
90 #else
#if !__HIP_NO_STD_DEFS__
// hipRTC compiles cannot include <cstdint>, so provide the fixed-width
// integer names here. Assumes a target where 'int' is 32-bit and
// 'long long' is 64-bit.
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
typedef signed int int32_t;
typedef signed long long int64_t;
// Mirror the names into namespace std so code written against std::uint32_t
// etc. still compiles under hipRTC.
namespace std {
using ::uint32_t;
using ::uint64_t;
using ::int32_t;
using ::int64_t;
}
#endif // __HIP_NO_STD_DEFS__
103 #endif // !defined(__HIPCC_RTC__)
104 
105 #if __HIP_CLANG_ONLY__
106 
#if !defined(__align__)
// CUDA-compatible alignment annotation, lowered to the GCC/Clang
// aligned attribute.
#define __align__(x) __attribute__((aligned(x)))
#endif

// CUDA driver-API compatibility alias.
#define CUDA_SUCCESS hipSuccess
112 
113 #if !defined(__HIPCC_RTC__)
114 #include <hip/hip_runtime_api.h>
115 #include <hip/amd_detail/amd_hip_atomic.h>
116 #include <hip/amd_detail/amd_device_functions.h>
117 #include <hip/amd_detail/amd_surface_functions.h>
118 #include <hip/amd_detail/texture_fetch_functions.h>
119 #include <hip/amd_detail/texture_indirect_functions.h>
120 extern int HIP_TRACE_API;
121 #endif // !defined(__HIPCC_RTC__)
122 
123 #ifdef __cplusplus
124 #include <hip/amd_detail/hip_ldg.h>
125 #endif
126 
128 
129 // TODO-HCC remove old definitions ; ~1602 hcc supports __HCC_ACCELERATOR__ define.
130 #if defined(__KALMAR_ACCELERATOR__) && !defined(__HCC_ACCELERATOR__)
131 #define __HCC_ACCELERATOR__ __KALMAR_ACCELERATOR__
132 #endif
133 
134 // Feature tests:
135 #if (defined(__HCC_ACCELERATOR__) && (__HCC_ACCELERATOR__ != 0)) || __HIP_DEVICE_COMPILE__
136 // Device compile and not host compile:
137 
138 // 32-bit Atomics:
139 #define __HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__ (1)
140 #define __HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__ (1)
141 #define __HIP_ARCH_HAS_SHARED_INT32_ATOMICS__ (1)
142 #define __HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__ (1)
143 #define __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__ (1)
144 
145 // 64-bit Atomics:
146 #define __HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__ (1)
147 #define __HIP_ARCH_HAS_SHARED_INT64_ATOMICS__ (1)
148 
149 // Doubles
150 #define __HIP_ARCH_HAS_DOUBLES__ (1)
151 
152 // warp cross-lane operations:
153 #define __HIP_ARCH_HAS_WARP_VOTE__ (1)
154 #define __HIP_ARCH_HAS_WARP_BALLOT__ (1)
155 #define __HIP_ARCH_HAS_WARP_SHUFFLE__ (1)
156 #define __HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__ (0)
157 
158 // sync
159 #define __HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__ (1)
160 #define __HIP_ARCH_HAS_SYNC_THREAD_EXT__ (0)
161 
162 // misc
163 #define __HIP_ARCH_HAS_SURFACE_FUNCS__ (0)
164 #define __HIP_ARCH_HAS_3DGRID__ (1)
165 #define __HIP_ARCH_HAS_DYNAMIC_PARALLEL__ (0)
166 
167 #endif /* Device feature flags */
168 
169 
// __launch_bounds__(maxThreadsPerBlock[, minBlocksPerMultiprocessor]) —
// CUDA-style launch-bounds annotation mapped onto AMDGPU function
// attributes. select_impl_ dispatches on the number of arguments.
// NOTE(review): the second argument is forwarded to amdgpu_waves_per_eu,
// which is a waves-per-EU count rather than a blocks-per-multiprocessor
// count — confirm intended mapping.
#define launch_bounds_impl0(requiredMaxThreadsPerBlock) \
    __attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock)))
#define launch_bounds_impl1(requiredMaxThreadsPerBlock, minBlocksPerMultiprocessor) \
    __attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock), \
                   amdgpu_waves_per_eu(minBlocksPerMultiprocessor)))
#define select_impl_(_1, _2, impl_, ...) impl_
#define __launch_bounds__(...) \
    select_impl_(__VA_ARGS__, launch_bounds_impl1, launch_bounds_impl0, )(__VA_ARGS__)
178 
#if !defined(__HIPCC_RTC__)
// Host-side stub: the dynamic group segment exists only in device code,
// so the host variant has no base pointer to return.
__host__ inline void* __get_dynamicgroupbaseptr() { return nullptr; }
#endif // !defined(__HIPCC_RTC__)
182 
183 // End doxygen API:
188 //
189 // hip-clang functions
190 //
191 #if !defined(__HIPCC_RTC__)
// Identity macros kept for source compatibility with code ported from CUDA:
// both expand to their argument unchanged on the AMD path.
#define HIP_KERNEL_NAME(...) __VA_ARGS__
#define HIP_SYMBOL(X) X

// Legacy launch-parameter placeholder type (no longer carries data).
typedef int hipLaunchParm;
196 
// pArgs base case: selected when n == sizeof...(Ts), i.e. every argument
// pointer has already been recorded; nothing left to do.
template <std::size_t n, typename... Ts,
          typename std::enable_if<n == sizeof...(Ts)>::type* = nullptr>
void pArgs(const std::tuple<Ts...>&, void*) {}
200 
// pArgs recursive case: stores the address of tuple element n into
// _vargs[n], then recurses for element n + 1. Selected while
// n != sizeof...(Ts). The tuple must outlive the use of _vargs, since only
// addresses are captured.
template <std::size_t n, typename... Ts,
          typename std::enable_if<n != sizeof...(Ts)>::type* = nullptr>
void pArgs(const std::tuple<Ts...>& formals, void** _vargs) {
    using T = typename std::tuple_element<n, std::tuple<Ts...> >::type;

    static_assert(!std::is_reference<T>{},
                  "A __global__ function cannot have a reference as one of its "
                  "arguments.");
#if defined(HIP_STRICT)
    static_assert(std::is_trivially_copyable<T>{},
                  "Only TriviallyCopyable types can be arguments to a __global__ "
                  "function");
#endif
    // The launch API takes a mutable void**, so shed const from the
    // element's address here.
    _vargs[n] = const_cast<void*>(reinterpret_cast<const void*>(&std::get<n>(formals)));
    return pArgs<n + 1>(formals, _vargs);
}
217 
// Validates at compile time that the number of actual arguments matches the
// kernel's formal parameter list, then materializes a tuple of the formal
// parameter types from the actuals (applying the usual implicit
// conversions, e.g. int -> long).
template <typename... Formals, typename... Actuals>
std::tuple<Formals...> validateArgsCountType(void (*kernel)(Formals...), std::tuple<Actuals...>(actuals)) {
    static_assert(sizeof...(Formals) == sizeof...(Actuals), "Argument Count Mismatch");
    return std::tuple<Formals...>{std::move(actuals)};
}
224 
#if defined(HIP_TEMPLATE_KERNEL_LAUNCH)
// Template form of the launcher: validates the argument pack against the
// kernel signature, packs the argument addresses into a void*[] and
// forwards to hipLaunchKernel.
template <typename... Args, typename F = void (*)(Args...)>
void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks,
                        std::uint32_t sharedMemBytes, hipStream_t stream, Args... args) {
    constexpr size_t count = sizeof...(Args);
    // tup holds the arguments converted to the kernel's formal parameter
    // types; _Args captures their addresses, so tup must stay alive until
    // the launch call below.
    auto tup_ = std::tuple<Args...>{args...};
    auto tup = validateArgsCountType(kernel, tup_);
    // NOTE(review): count == 0 yields a zero-length array — verify that
    // zero-argument kernels take this path.
    void* _Args[count];
    pArgs<0>(tup, _Args);

    auto k = reinterpret_cast<void*>(kernel);
    hipLaunchKernel(k, numBlocks, dimBlocks, _Args, sharedMemBytes, stream);
}
#else
// Macro form: expands directly to a triple-chevron launch.
#define hipLaunchKernelGGLInternal(kernelName, numBlocks, numThreads, memPerBlock, streamId, ...) \
    do {                                                                                          \
        kernelName<<<(numBlocks), (numThreads), (memPerBlock), (streamId)>>>(__VA_ARGS__);        \
    } while (0)

#define hipLaunchKernelGGL(kernelName, ...) hipLaunchKernelGGLInternal((kernelName), __VA_ARGS__)
#endif
246 
247 #include <hip/hip_runtime_api.h>
248 #endif // !defined(__HIPCC_RTC__)
249 
#if defined(__HIPCC_RTC__)
// hipRTC compiles do not see hip_runtime_api.h, so define the dim3
// launch-geometry type here. All components default to 1.
typedef struct dim3 {
    uint32_t x;
    uint32_t y;
    uint32_t z;
#ifdef __cplusplus
    constexpr __device__ dim3(uint32_t _x = 1, uint32_t _y = 1, uint32_t _z = 1) : x(_x), y(_y), z(_z){};
#endif
} dim3;
#endif // defined(__HIPCC_RTC__)
260 
#pragma push_macro("__DEVICE__")
#define __DEVICE__ static __device__ __forceinline__

// Accessors backing the threadIdx/blockIdx/blockDim/gridDim builtins.
// Each __ockl_* symbol is a device-library intrinsic; the 'const'
// attribute allows the compiler to fold repeated calls. Dimension index:
// 0 = x, 1 = y, 2 = z.

extern "C" __device__ __attribute__((const)) size_t __ockl_get_local_id(unsigned int);
__DEVICE__ unsigned int __hip_get_thread_idx_x() { return __ockl_get_local_id(0); }
__DEVICE__ unsigned int __hip_get_thread_idx_y() { return __ockl_get_local_id(1); }
__DEVICE__ unsigned int __hip_get_thread_idx_z() { return __ockl_get_local_id(2); }

extern "C" __device__ __attribute__((const)) size_t __ockl_get_group_id(unsigned int);
__DEVICE__ unsigned int __hip_get_block_idx_x() { return __ockl_get_group_id(0); }
__DEVICE__ unsigned int __hip_get_block_idx_y() { return __ockl_get_group_id(1); }
__DEVICE__ unsigned int __hip_get_block_idx_z() { return __ockl_get_group_id(2); }

extern "C" __device__ __attribute__((const)) size_t __ockl_get_local_size(unsigned int);
__DEVICE__ unsigned int __hip_get_block_dim_x() { return __ockl_get_local_size(0); }
__DEVICE__ unsigned int __hip_get_block_dim_y() { return __ockl_get_local_size(1); }
__DEVICE__ unsigned int __hip_get_block_dim_z() { return __ockl_get_local_size(2); }

extern "C" __device__ __attribute__((const)) size_t __ockl_get_num_groups(unsigned int);
__DEVICE__ unsigned int __hip_get_grid_dim_x() { return __ockl_get_num_groups(0); }
__DEVICE__ unsigned int __hip_get_grid_dim_y() { return __ockl_get_num_groups(1); }
__DEVICE__ unsigned int __hip_get_grid_dim_z() { return __ockl_get_num_groups(2); }

// Declares a read-only member DIMENSION whose reads invoke the generated
// getter, which in turn calls FUNCTION (Clang supports
// __declspec(property) for this).
#define __HIP_DEVICE_BUILTIN(DIMENSION, FUNCTION) \
    __declspec(property(get = __get_##DIMENSION)) unsigned int DIMENSION; \
    __DEVICE__ unsigned int __get_##DIMENSION(void) { \
        return FUNCTION; \
    }
289 
// Type of the threadIdx builtin: x/y/z are read-only properties that call
// the __hip_get_thread_idx_* accessors.
struct __hip_builtin_threadIdx_t {
    __HIP_DEVICE_BUILTIN(x,__hip_get_thread_idx_x());
    __HIP_DEVICE_BUILTIN(y,__hip_get_thread_idx_y());
    __HIP_DEVICE_BUILTIN(z,__hip_get_thread_idx_z());
#ifdef __cplusplus
    // Implicit conversion so the builtin can be passed where dim3 is expected.
    __device__ operator dim3() const { return dim3(x, y, z); }
#endif
};
298 
// Type of the blockIdx builtin: x/y/z are read-only properties that call
// the __hip_get_block_idx_* accessors.
struct __hip_builtin_blockIdx_t {
    __HIP_DEVICE_BUILTIN(x,__hip_get_block_idx_x());
    __HIP_DEVICE_BUILTIN(y,__hip_get_block_idx_y());
    __HIP_DEVICE_BUILTIN(z,__hip_get_block_idx_z());
#ifdef __cplusplus
    // Implicit conversion so the builtin can be passed where dim3 is expected.
    __device__ operator dim3() const { return dim3(x, y, z); }
#endif
};
307 
// Type of the blockDim builtin: x/y/z are read-only properties that call
// the __hip_get_block_dim_* accessors.
struct __hip_builtin_blockDim_t {
    __HIP_DEVICE_BUILTIN(x,__hip_get_block_dim_x());
    __HIP_DEVICE_BUILTIN(y,__hip_get_block_dim_y());
    __HIP_DEVICE_BUILTIN(z,__hip_get_block_dim_z());
#ifdef __cplusplus
    // Implicit conversion so the builtin can be passed where dim3 is expected.
    __device__ operator dim3() const { return dim3(x, y, z); }
#endif
};
316 
// Type of the gridDim builtin: x/y/z are read-only properties that call
// the __hip_get_grid_dim_* accessors.
struct __hip_builtin_gridDim_t {
    __HIP_DEVICE_BUILTIN(x,__hip_get_grid_dim_x());
    __HIP_DEVICE_BUILTIN(y,__hip_get_grid_dim_y());
    __HIP_DEVICE_BUILTIN(z,__hip_get_grid_dim_z());
#ifdef __cplusplus
    // Implicit conversion so the builtin can be passed where dim3 is expected.
    __device__ operator dim3() const { return dim3(x, y, z); }
#endif
};
325 
#undef __HIP_DEVICE_BUILTIN
#pragma pop_macro("__DEVICE__")

// The builtin coordinate variables themselves. Weak linkage allows the
// definition to appear in multiple translation units without duplicate
// symbol errors.
extern const __device__ __attribute__((weak)) __hip_builtin_threadIdx_t threadIdx;
extern const __device__ __attribute__((weak)) __hip_builtin_blockIdx_t blockIdx;
extern const __device__ __attribute__((weak)) __hip_builtin_blockDim_t blockDim;
extern const __device__ __attribute__((weak)) __hip_builtin_gridDim_t gridDim;
333 
// Legacy hipFoo_x spellings for the builtin coordinate components.
#define hipThreadIdx_x threadIdx.x
#define hipThreadIdx_y threadIdx.y
#define hipThreadIdx_z threadIdx.z

#define hipBlockIdx_x blockIdx.x
#define hipBlockIdx_y blockIdx.y
#define hipBlockIdx_z blockIdx.z

#define hipBlockDim_x blockDim.x
#define hipBlockDim_y blockDim.y
#define hipBlockDim_z blockDim.z

#define hipGridDim_x gridDim.x
#define hipGridDim_y gridDim.y
#define hipGridDim_z gridDim.z
349 
350 #if !defined(__HIPCC_RTC__)
351 #include <hip/amd_detail/amd_math_functions.h>
352 #endif
353 
354 #if __HIP_HCC_COMPAT_MODE__
355 // Define HCC work item functions in terms of HIP builtin variables.
#pragma push_macro("__DEFINE_HCC_FUNC")
// Expands to hc_get_<hc_fun>(i): returns component i of the given HIP
// builtin variable, with 0 -> x, 1 -> y, and any other value -> z.
#define __DEFINE_HCC_FUNC(hc_fun,hip_var) \
inline __device__ __attribute__((always_inline)) unsigned int hc_get_##hc_fun(unsigned int i) { \
  if (i==0) \
    return hip_var.x; \
  else if(i==1) \
    return hip_var.y; \
  else \
    return hip_var.z; \
}

__DEFINE_HCC_FUNC(workitem_id, threadIdx)
__DEFINE_HCC_FUNC(group_id, blockIdx)
__DEFINE_HCC_FUNC(group_size, blockDim)
__DEFINE_HCC_FUNC(num_groups, gridDim)
#pragma pop_macro("__DEFINE_HCC_FUNC")
372 
extern "C" __device__ __attribute__((const)) size_t __ockl_get_global_id(unsigned int);
// HCC-compat helper: grid-wide (absolute) work-item id along dimension
// 'dim' (0 = x, 1 = y, 2 = z), truncated to unsigned int.
inline __device__ __attribute__((always_inline)) unsigned int
hc_get_workitem_absolute_id(int dim)
{
    return (unsigned int)__ockl_get_global_id(dim);
}
379 
380 #endif
381 
382 #if !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
383 #if !defined(__HIPCC_RTC__)
384 // Support std::complex.
385 #if !_OPENMP || __HIP_ENABLE_CUDA_WRAPPER_FOR_OPENMP__
386 #pragma push_macro("__CUDA__")
387 #define __CUDA__
388 #include <__clang_cuda_math_forward_declares.h>
389 #include <__clang_cuda_complex_builtins.h>
390 // Workaround for using libc++ with HIP-Clang.
391 // The following headers requires clang include path before standard C++ include path.
392 // However libc++ include path requires to be before clang include path.
393 // To workaround this, we pass -isystem with the parent directory of clang include
394 // path instead of the clang include path itself.
395 #include <include/cuda_wrappers/algorithm>
396 #include <include/cuda_wrappers/complex>
397 #include <include/cuda_wrappers/new>
398 #undef __CUDA__
399 #pragma pop_macro("__CUDA__")
400 #endif // !_OPENMP || __HIP_ENABLE_CUDA_WRAPPER_FOR_OPENMP__
401 #endif // !defined(__HIPCC_RTC__)
402 #endif // !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
403 #endif // __HIP_CLANG_ONLY__
404 
#endif // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_RUNTIME_H
// Cross-references carried over from the generated documentation:
//   #define __host__                                  (host_defines.h:170)
//   _Float16 __2f16 __attribute__((ext_vector_type(2)))  (hip_fp16_math_fwd.h:57)