#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_RUNTIME_H
#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_RUNTIME_H

#include <hip/amd_detail/amd_hip_common.h>
#if !defined(__HIPCC_RTC__)
// Build identification entry points for the AMD debug API.
const char* amd_dbgapi_get_build_name();
const char* amd_dbgapi_get_git_hash();
size_t amd_dbgapi_get_build_id();
#endif // !defined(__HIPCC_RTC__)
#if defined(__HIPCC_RTC__)
#if !__HIP_NO_STD_DEFS__
// hipRTC compiles without the standard headers, so provide the fixed-width
// integer types that device code expects.
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
typedef signed int int32_t;
typedef signed long long int64_t;
#endif // !__HIP_NO_STD_DEFS__
#endif // defined(__HIPCC_RTC__)
#if __HIP_CLANG_ONLY__
#if !defined(__align__)
#define __align__(x) __attribute__((aligned(x)))
#endif

#define CUDA_SUCCESS hipSuccess
#if !defined(__HIPCC_RTC__)
#include <hip/hip_runtime_api.h>
#include <hip/amd_detail/amd_hip_atomic.h>
#include <hip/amd_detail/amd_device_functions.h>
#include <hip/amd_detail/amd_surface_functions.h>
#include <hip/amd_detail/texture_fetch_functions.h>
#include <hip/amd_detail/texture_indirect_functions.h>
extern int HIP_TRACE_API;
#endif // !defined(__HIPCC_RTC__)
#include <hip/amd_detail/hip_ldg.h>
#if defined(__KALMAR_ACCELERATOR__) && !defined(__HCC_ACCELERATOR__)
#define __HCC_ACCELERATOR__ __KALMAR_ACCELERATOR__
#endif
// Feature tests:
#if (defined(__HCC_ACCELERATOR__) && (__HCC_ACCELERATOR__ != 0)) || __HIP_DEVICE_COMPILE__
// Device compile and not host compile:

// 32-bit Atomics
#define __HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__ (1)
#define __HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__ (1)
#define __HIP_ARCH_HAS_SHARED_INT32_ATOMICS__ (1)
#define __HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__ (1)
#define __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__ (1)

// 64-bit Atomics
#define __HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__ (1)
#define __HIP_ARCH_HAS_SHARED_INT64_ATOMICS__ (1)

// Doubles
#define __HIP_ARCH_HAS_DOUBLES__ (1)

// Warp cross-lane operations
#define __HIP_ARCH_HAS_WARP_VOTE__ (1)
#define __HIP_ARCH_HAS_WARP_BALLOT__ (1)
#define __HIP_ARCH_HAS_WARP_SHUFFLE__ (1)
#define __HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__ (0)

// Sync
#define __HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__ (1)
#define __HIP_ARCH_HAS_SYNC_THREAD_EXT__ (0)

// Misc
#define __HIP_ARCH_HAS_SURFACE_FUNCS__ (0)
#define __HIP_ARCH_HAS_3DGRID__ (1)
#define __HIP_ARCH_HAS_DYNAMIC_PARALLEL__ (0)

#endif // device feature flags
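// Illustrative use of the feature flags (hypothetical device code, not part of
// this header): they are intended for conditional compilation, e.g.
//
//   #if __HIP_ARCH_HAS_DOUBLES__
//   __device__ double halve(double v) { return v * 0.5; }
//   #endif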
#define launch_bounds_impl0(requiredMaxThreadsPerBlock)                                            \
    __attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock)))
#define launch_bounds_impl1(requiredMaxThreadsPerBlock, minBlocksPerMultiprocessor)                \
    __attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock),                     \
                   amdgpu_waves_per_eu(minBlocksPerMultiprocessor)))
#define select_impl_(_1, _2, impl_, ...) impl_
#define __launch_bounds__(...)                                                                     \
    select_impl_(__VA_ARGS__, launch_bounds_impl1, launch_bounds_impl0, )(__VA_ARGS__)
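// Usage sketch (hypothetical kernel; the bound values are illustrative): the
// first argument caps the flat work-group size, and the optional second one
// feeds amdgpu_waves_per_eu as an occupancy hint.
//
//   __global__ void __launch_bounds__(256, 2) scale(float* p, float s) {
//       p[threadIdx.x] *= s;
//   }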
#if !defined(__HIPCC_RTC__)
__host__ inline void* __get_dynamicgroupbaseptr() { return nullptr; }
#endif // !defined(__HIPCC_RTC__)
#if !defined(__HIPCC_RTC__)
#define HIP_KERNEL_NAME(...) __VA_ARGS__
#define HIP_SYMBOL(X) X

typedef int hipLaunchParm;
template <std::size_t n, typename... Ts,
          typename std::enable_if<n == sizeof...(Ts)>::type* = nullptr>
void pArgs(const std::tuple<Ts...>&, void*) {}
template <std::size_t n, typename... Ts,
          typename std::enable_if<n != sizeof...(Ts)>::type* = nullptr>
void pArgs(const std::tuple<Ts...>& formals, void** _vargs) {
    using T = typename std::tuple_element<n, std::tuple<Ts...> >::type;

    static_assert(!std::is_reference<T>{},
                  "A __global__ function cannot have a reference as one of its "
                  "arguments.");
#if defined(HIP_STRICT)
    static_assert(std::is_trivially_copyable<T>{},
                  "Only TriviallyCopyable types can be arguments to a __global__ "
                  "function");
#endif
    _vargs[n] = const_cast<void*>(reinterpret_cast<const void*>(&std::get<n>(formals)));
    return pArgs<n + 1>(formals, _vargs);
}
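// Sketch of what the pArgs recursion produces (hypothetical formals tuple): it
// walks the tuple and records the address of each element, yielding the void**
// array that hipLaunchKernel expects.
//
//   std::tuple<int, float> formals{1, 2.0f};
//   void* vargs[2];
//   pArgs<0>(formals, vargs);  // vargs[0] == &std::get<0>(formals), etc.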
template <typename... Formals, typename... Actuals>
std::tuple<Formals...> validateArgsCountType(void (*kernel)(Formals...),
                                             std::tuple<Actuals...>(actuals)) {
    static_assert(sizeof...(Formals) == sizeof...(Actuals), "Argument Count Mismatch");
    std::tuple<Formals...> to_formals{std::move(actuals)};
    return to_formals;
}
#if defined(HIP_TEMPLATE_KERNEL_LAUNCH)
template <typename... Args, typename F = void (*)(Args...)>
void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks,
                        std::uint32_t sharedMemBytes, hipStream_t stream, Args... args) {
    constexpr size_t count = sizeof...(Args);
    auto tup_ = std::tuple<Args...>{args...};
    auto tup = validateArgsCountType(kernel, tup_);
    void* _Args[count];
    pArgs<0>(tup, _Args);

    auto k = reinterpret_cast<void*>(kernel);
    hipLaunchKernel(k, numBlocks, dimBlocks, _Args, sharedMemBytes, stream);
}
#else
#define hipLaunchKernelGGLInternal(kernelName, numBlocks, numThreads, memPerBlock, streamId, ...)  \
    do {                                                                                           \
        kernelName<<<(numBlocks), (numThreads), (memPerBlock), (streamId)>>>(__VA_ARGS__);         \
    } while (0)

#define hipLaunchKernelGGL(kernelName, ...) hipLaunchKernelGGLInternal((kernelName), __VA_ARGS__)
#endif // defined(HIP_TEMPLATE_KERNEL_LAUNCH)
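// Launch sketch (hypothetical kernel, sizes, and device pointers; stream 0 is
// the default stream). The macro form expands to an ordinary triple-chevron
// launch; the template form routes through hipLaunchKernel instead.
//
//   __global__ void axpy(float a, const float* x, float* y) {
//       unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
//       y[i] += a * x[i];
//   }
//
//   hipLaunchKernelGGL(axpy, dim3(blocks), dim3(256), 0, 0, a, d_x, d_y);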
#include <hip/hip_runtime_api.h>
#endif // !defined(__HIPCC_RTC__)
#if defined(__HIPCC_RTC__)
typedef struct dim3 {
    uint32_t x;
    uint32_t y;
    uint32_t z;

    constexpr __device__ dim3(uint32_t _x = 1, uint32_t _y = 1, uint32_t _z = 1) : x(_x), y(_y), z(_z){};
} dim3;
#endif // defined(__HIPCC_RTC__)
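// Construction sketch: unspecified components default to 1, so
//
//   dim3 grid(64);        // {64, 1, 1}
//   dim3 block(16, 16);   // {16, 16, 1}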
#pragma push_macro("__DEVICE__")
#define __DEVICE__ static __device__ __forceinline__
extern "C" __device__ __attribute__((const)) size_t __ockl_get_local_id(unsigned int);
__DEVICE__ unsigned int __hip_get_thread_idx_x() { return __ockl_get_local_id(0); }
__DEVICE__ unsigned int __hip_get_thread_idx_y() { return __ockl_get_local_id(1); }
__DEVICE__ unsigned int __hip_get_thread_idx_z() { return __ockl_get_local_id(2); }

extern "C" __device__ __attribute__((const)) size_t __ockl_get_group_id(unsigned int);
__DEVICE__ unsigned int __hip_get_block_idx_x() { return __ockl_get_group_id(0); }
__DEVICE__ unsigned int __hip_get_block_idx_y() { return __ockl_get_group_id(1); }
__DEVICE__ unsigned int __hip_get_block_idx_z() { return __ockl_get_group_id(2); }

extern "C" __device__ __attribute__((const)) size_t __ockl_get_local_size(unsigned int);
__DEVICE__ unsigned int __hip_get_block_dim_x() { return __ockl_get_local_size(0); }
__DEVICE__ unsigned int __hip_get_block_dim_y() { return __ockl_get_local_size(1); }
__DEVICE__ unsigned int __hip_get_block_dim_z() { return __ockl_get_local_size(2); }

extern "C" __device__ __attribute__((const)) size_t __ockl_get_num_groups(unsigned int);
__DEVICE__ unsigned int __hip_get_grid_dim_x() { return __ockl_get_num_groups(0); }
__DEVICE__ unsigned int __hip_get_grid_dim_y() { return __ockl_get_num_groups(1); }
__DEVICE__ unsigned int __hip_get_grid_dim_z() { return __ockl_get_num_groups(2); }
#define __HIP_DEVICE_BUILTIN(DIMENSION, FUNCTION)                                                  \
    __declspec(property(get = __get_##DIMENSION)) unsigned int DIMENSION;                          \
    __DEVICE__ unsigned int __get_##DIMENSION(void) {                                              \
        return FUNCTION;                                                                           \
    }
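// Expansion sketch: __HIP_DEVICE_BUILTIN(x, __hip_get_thread_idx_x()) declares
// a Clang property member whose getter forwards to the wrapper above, roughly:
//
//   __declspec(property(get = __get_x)) unsigned int x;
//   __DEVICE__ unsigned int __get_x(void) { return __hip_get_thread_idx_x(); }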
struct __hip_builtin_threadIdx_t {
    __HIP_DEVICE_BUILTIN(x,__hip_get_thread_idx_x());
    __HIP_DEVICE_BUILTIN(y,__hip_get_thread_idx_y());
    __HIP_DEVICE_BUILTIN(z,__hip_get_thread_idx_z());

    __device__ operator dim3() const { return dim3(x, y, z); }
};

struct __hip_builtin_blockIdx_t {
    __HIP_DEVICE_BUILTIN(x,__hip_get_block_idx_x());
    __HIP_DEVICE_BUILTIN(y,__hip_get_block_idx_y());
    __HIP_DEVICE_BUILTIN(z,__hip_get_block_idx_z());

    __device__ operator dim3() const { return dim3(x, y, z); }
};

struct __hip_builtin_blockDim_t {
    __HIP_DEVICE_BUILTIN(x,__hip_get_block_dim_x());
    __HIP_DEVICE_BUILTIN(y,__hip_get_block_dim_y());
    __HIP_DEVICE_BUILTIN(z,__hip_get_block_dim_z());

    __device__ operator dim3() const { return dim3(x, y, z); }
};

struct __hip_builtin_gridDim_t {
    __HIP_DEVICE_BUILTIN(x,__hip_get_grid_dim_x());
    __HIP_DEVICE_BUILTIN(y,__hip_get_grid_dim_y());
    __HIP_DEVICE_BUILTIN(z,__hip_get_grid_dim_z());

    __device__ operator dim3() const { return dim3(x, y, z); }
};
#undef __HIP_DEVICE_BUILTIN
#pragma pop_macro("__DEVICE__")
extern const __device__ __attribute__((weak)) __hip_builtin_threadIdx_t threadIdx;
extern const __device__ __attribute__((weak)) __hip_builtin_blockIdx_t blockIdx;
extern const __device__ __attribute__((weak)) __hip_builtin_blockDim_t blockDim;
extern const __device__ __attribute__((weak)) __hip_builtin_gridDim_t gridDim;
#define hipThreadIdx_x threadIdx.x
#define hipThreadIdx_y threadIdx.y
#define hipThreadIdx_z threadIdx.z

#define hipBlockIdx_x blockIdx.x
#define hipBlockIdx_y blockIdx.y
#define hipBlockIdx_z blockIdx.z

#define hipBlockDim_x blockDim.x
#define hipBlockDim_y blockDim.y
#define hipBlockDim_z blockDim.z

#define hipGridDim_x gridDim.x
#define hipGridDim_y gridDim.y
#define hipGridDim_z gridDim.z
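// Indexing sketch (hypothetical kernel): the builtin variables compose into a
// global thread index as in CUDA; the hip*_x spellings above are legacy
// aliases for the same values.
//
//   __global__ void iota(unsigned int* out, unsigned int n) {
//       unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
//       if (i < n) out[i] = i;
//   }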
#if !defined(__HIPCC_RTC__)
#include <hip/amd_detail/amd_math_functions.h>
#endif // !defined(__HIPCC_RTC__)
#if __HIP_HCC_COMPAT_MODE__
// Define HCC work-item accessors in terms of the HIP builtin variables.
#pragma push_macro("__DEFINE_HCC_FUNC")
#define __DEFINE_HCC_FUNC(hc_fun,hip_var) \
inline __device__ __attribute__((always_inline)) unsigned int hc_get_##hc_fun(unsigned int i) { \
  if (i==0) \
    return hip_var.x; \
  else if(i==1) \
    return hip_var.y; \
  else \
    return hip_var.z; \
}

__DEFINE_HCC_FUNC(workitem_id, threadIdx)
__DEFINE_HCC_FUNC(group_id, blockIdx)
__DEFINE_HCC_FUNC(group_size, blockDim)
__DEFINE_HCC_FUNC(num_groups, gridDim)
#pragma pop_macro("__DEFINE_HCC_FUNC")
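// Equivalence sketch: in HCC compatibility mode the generated accessors index
// the HIP builtins by dimension, e.g.
//
//   hc_get_workitem_id(0) == threadIdx.x
//   hc_get_group_size(1)  == blockDim.y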
extern "C" __device__ __attribute__((const)) size_t __ockl_get_global_id(unsigned int);
inline __device__ __attribute__((always_inline)) unsigned int
hc_get_workitem_absolute_id(int dim)
{
    return (unsigned int)__ockl_get_global_id(dim);
}
#endif // __HIP_HCC_COMPAT_MODE__
#if !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
#if !defined(__HIPCC_RTC__)
// Support std::complex.
#if !_OPENMP || __HIP_ENABLE_CUDA_WRAPPER_FOR_OPENMP__
#pragma push_macro("__CUDA__")
#define __CUDA__
#include <__clang_cuda_math_forward_declares.h>
#include <__clang_cuda_complex_builtins.h>
// Workaround for using libc++ with HIP-Clang: these wrapper headers provide
// the __device__-compatible overloads that the standard headers lack.
#include <include/cuda_wrappers/algorithm>
#include <include/cuda_wrappers/complex>
#include <include/cuda_wrappers/new>
#undef __CUDA__
#pragma pop_macro("__CUDA__")
#endif // !_OPENMP || __HIP_ENABLE_CUDA_WRAPPER_FOR_OPENMP__
#endif // !defined(__HIPCC_RTC__)
#endif // !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
#endif // __HIP_CLANG_ONLY__

#endif // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_RUNTIME_H