HIP: Heterogeneous-computing Interface for Portability
amd_warp_functions.h
/*
Copyright (c) 2022 - 2023 Advanced Micro Devices, Inc. All rights reserved.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_WARP_FUNCTIONS_H
#define HIP_INCLUDE_HIP_AMD_DETAIL_WARP_FUNCTIONS_H

__device__ static inline unsigned __hip_ds_bpermute(int index, unsigned src) {
  union { int i; unsigned u; float f; } tmp; tmp.u = src;
  tmp.i = __builtin_amdgcn_ds_bpermute(index, tmp.i);
  return tmp.u;
}

__device__ static inline float __hip_ds_bpermutef(int index, float src) {
  union { int i; unsigned u; float f; } tmp; tmp.f = src;
  tmp.i = __builtin_amdgcn_ds_bpermute(index, tmp.i);
  return tmp.f;
}

__device__ static inline unsigned __hip_ds_permute(int index, unsigned src) {
  union { int i; unsigned u; float f; } tmp; tmp.u = src;
  tmp.i = __builtin_amdgcn_ds_permute(index, tmp.i);
  return tmp.u;
}

__device__ static inline float __hip_ds_permutef(int index, float src) {
  union { int i; unsigned u; float f; } tmp; tmp.f = src;
  tmp.i = __builtin_amdgcn_ds_permute(index, tmp.i);
  return tmp.f;
}
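
// The four wrappers above expose the GCN cross-lane permute instructions.
// ds_bpermute is a gather: each lane reads the value held by the lane
// selected by `index`, which is a byte address (lane id * 4). ds_permute is
// the scatter counterpart: each lane pushes its value to the lane selected
// by `index`.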

#define __hip_ds_swizzle(src, pattern) __hip_ds_swizzle_N<(pattern)>((src))
#define __hip_ds_swizzlef(src, pattern) __hip_ds_swizzlef_N<(pattern)>((src))

template <int pattern>
__device__ static inline unsigned __hip_ds_swizzle_N(unsigned int src) {
  union { int i; unsigned u; float f; } tmp; tmp.u = src;
  tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
  return tmp.u;
}

template <int pattern>
__device__ static inline float __hip_ds_swizzlef_N(float src) {
  union { int i; unsigned u; float f; } tmp; tmp.f = src;
  tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
  return tmp.f;
}
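
// __builtin_amdgcn_ds_swizzle requires its pattern operand to be an integral
// constant expression, so the __hip_ds_swizzle/__hip_ds_swizzlef macros route
// the call through a template whose non-type parameter carries the pattern.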

#define __hip_move_dpp(src, dpp_ctrl, row_mask, bank_mask, bound_ctrl) \
  __hip_move_dpp_N<(dpp_ctrl), (row_mask), (bank_mask), (bound_ctrl)>((src))

template <int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl>
__device__ static inline int __hip_move_dpp_N(int src) {
  return __builtin_amdgcn_mov_dpp(src, dpp_ctrl, row_mask, bank_mask,
                                  bound_ctrl);
}
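
// Likewise, the DPP control operands of __builtin_amdgcn_mov_dpp must be
// compile-time constants, hence the macro/template indirection above.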

static constexpr int warpSize = __AMDGCN_WAVEFRONT_SIZE;

// warp vote functions: __all, __any, __ballot
__device__
inline
int __all(int predicate) {
  return __ockl_wfall_i32(predicate);
}

__device__
inline
int __any(int predicate) {
  return __ockl_wfany_i32(predicate);
}

// XXX from llvm/include/llvm/IR/InstrTypes.h
#define ICMP_NE 33

__device__
inline
unsigned long long int __ballot(int predicate) {
  return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
}

__device__
inline
unsigned long long int __ballot64(int predicate) {
  return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
}
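
// __all and __any reduce a per-lane predicate across the wavefront; __ballot
// and __ballot64 return a 64-bit mask in which bit i is set when lane i is
// active and its predicate is non-zero.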

// See amd_warp_sync_functions.h for an explanation of this preprocessor flag.
#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS
// Since threads in a wave do not make independent progress, __activemask()
// always returns the exact active mask, i.e., all active threads in the wave.
__device__
inline
unsigned long long __activemask() {
  return __ballot(true);
}
#endif // HIP_ENABLE_WARP_SYNC_BUILTINS

__device__ static inline unsigned int __lane_id() {
  return __builtin_amdgcn_mbcnt_hi(
      -1, __builtin_amdgcn_mbcnt_lo(-1, 0));
}
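
// __lane_id() counts the bits of an all-ones mask below the calling lane via
// the mbcnt_lo/mbcnt_hi pair, yielding the lane's index in the wavefront
// (0..warpSize-1). The __shfl family below uses it to build a source-lane
// index confined to the caller's width-wide segment, then shifts it left by
// two to form the byte address expected by ds_bpermute.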

__device__
inline
int __shfl(int var, int src_lane, int width = warpSize) {
  int self = __lane_id();
  int index = (src_lane & (width - 1)) + (self & ~(width-1));
  return __builtin_amdgcn_ds_bpermute(index<<2, var);
}
__device__
inline
unsigned int __shfl(unsigned int var, int src_lane, int width = warpSize) {
  union { int i; unsigned u; float f; } tmp; tmp.u = var;
  tmp.i = __shfl(tmp.i, src_lane, width);
  return tmp.u;
}
__device__
inline
float __shfl(float var, int src_lane, int width = warpSize) {
  union { int i; unsigned u; float f; } tmp; tmp.f = var;
  tmp.i = __shfl(tmp.i, src_lane, width);
  return tmp.f;
}
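
// The 64-bit overloads split the value into two 32-bit halves with memcpy,
// shuffle each half independently, and reassemble the result.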
__device__
inline
double __shfl(double var, int src_lane, int width = warpSize) {
  static_assert(sizeof(double) == 2 * sizeof(int), "");
  static_assert(sizeof(double) == sizeof(uint64_t), "");

  int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
  tmp[0] = __shfl(tmp[0], src_lane, width);
  tmp[1] = __shfl(tmp[1], src_lane, width);

  uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
  double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
  return tmp1;
}
__device__
inline
long __shfl(long var, int src_lane, int width = warpSize)
{
  #ifndef _MSC_VER
  static_assert(sizeof(long) == 2 * sizeof(int), "");
  static_assert(sizeof(long) == sizeof(uint64_t), "");

  int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
  tmp[0] = __shfl(tmp[0], src_lane, width);
  tmp[1] = __shfl(tmp[1], src_lane, width);

  uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
  long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
  return tmp1;
  #else
  static_assert(sizeof(long) == sizeof(int), "");
  return static_cast<long>(__shfl(static_cast<int>(var), src_lane, width));
  #endif
}
__device__
inline
unsigned long __shfl(unsigned long var, int src_lane, int width = warpSize) {
  #ifndef _MSC_VER
  static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
  static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");

  unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
  tmp[0] = __shfl(tmp[0], src_lane, width);
  tmp[1] = __shfl(tmp[1], src_lane, width);

  uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
  unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
  return tmp1;
  #else
  static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
  return static_cast<unsigned long>(__shfl(static_cast<unsigned int>(var), src_lane, width));
  #endif
}
__device__
inline
long long __shfl(long long var, int src_lane, int width = warpSize)
{
  static_assert(sizeof(long long) == 2 * sizeof(int), "");
  static_assert(sizeof(long long) == sizeof(uint64_t), "");

  int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
  tmp[0] = __shfl(tmp[0], src_lane, width);
  tmp[1] = __shfl(tmp[1], src_lane, width);

  uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
  long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
  return tmp1;
}
__device__
inline
unsigned long long __shfl(unsigned long long var, int src_lane, int width = warpSize) {
  static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
  static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");

  unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
  tmp[0] = __shfl(tmp[0], src_lane, width);
  tmp[1] = __shfl(tmp[1], src_lane, width);

  uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
  unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
  return tmp1;
}

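// __shfl_up: each lane reads from the lane `lane_delta` positions below it.
// Lanes whose source would fall outside their width-wide segment keep their
// own value.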
__device__
inline
int __shfl_up(int var, unsigned int lane_delta, int width = warpSize) {
  int self = __lane_id();
  int index = self - lane_delta;
  index = (index < (self & ~(width-1)))?self:index;
  return __builtin_amdgcn_ds_bpermute(index<<2, var);
}
__device__
inline
unsigned int __shfl_up(unsigned int var, unsigned int lane_delta, int width = warpSize) {
  union { int i; unsigned u; float f; } tmp; tmp.u = var;
  tmp.i = __shfl_up(tmp.i, lane_delta, width);
  return tmp.u;
}
__device__
inline
float __shfl_up(float var, unsigned int lane_delta, int width = warpSize) {
  union { int i; unsigned u; float f; } tmp; tmp.f = var;
  tmp.i = __shfl_up(tmp.i, lane_delta, width);
  return tmp.f;
}
__device__
inline
double __shfl_up(double var, unsigned int lane_delta, int width = warpSize) {
  static_assert(sizeof(double) == 2 * sizeof(int), "");
  static_assert(sizeof(double) == sizeof(uint64_t), "");

  int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
  tmp[0] = __shfl_up(tmp[0], lane_delta, width);
  tmp[1] = __shfl_up(tmp[1], lane_delta, width);

  uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
  double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
  return tmp1;
}
__device__
inline
long __shfl_up(long var, unsigned int lane_delta, int width = warpSize)
{
  #ifndef _MSC_VER
  static_assert(sizeof(long) == 2 * sizeof(int), "");
  static_assert(sizeof(long) == sizeof(uint64_t), "");

  int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
  tmp[0] = __shfl_up(tmp[0], lane_delta, width);
  tmp[1] = __shfl_up(tmp[1], lane_delta, width);

  uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
  long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
  return tmp1;
  #else
  static_assert(sizeof(long) == sizeof(int), "");
  return static_cast<long>(__shfl_up(static_cast<int>(var), lane_delta, width));
  #endif
}

__device__
inline
unsigned long __shfl_up(unsigned long var, unsigned int lane_delta, int width = warpSize)
{
  #ifndef _MSC_VER
  static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
  static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");

  unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
  tmp[0] = __shfl_up(tmp[0], lane_delta, width);
  tmp[1] = __shfl_up(tmp[1], lane_delta, width);

  uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
  unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
  return tmp1;
  #else
  static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
  return static_cast<unsigned long>(__shfl_up(static_cast<unsigned int>(var), lane_delta, width));
  #endif
}

__device__
inline
long long __shfl_up(long long var, unsigned int lane_delta, int width = warpSize)
{
  static_assert(sizeof(long long) == 2 * sizeof(int), "");
  static_assert(sizeof(long long) == sizeof(uint64_t), "");
  int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
  tmp[0] = __shfl_up(tmp[0], lane_delta, width);
  tmp[1] = __shfl_up(tmp[1], lane_delta, width);
  uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
  long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
  return tmp1;
}

__device__
inline
unsigned long long __shfl_up(unsigned long long var, unsigned int lane_delta, int width = warpSize)
{
  static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
  static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
  unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
  tmp[0] = __shfl_up(tmp[0], lane_delta, width);
  tmp[1] = __shfl_up(tmp[1], lane_delta, width);
  uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
  unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
  return tmp1;
}

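// __shfl_down: each lane reads from the lane `lane_delta` positions above it,
// keeping its own value when the source would cross the segment boundary.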
__device__
inline
int __shfl_down(int var, unsigned int lane_delta, int width = warpSize) {
  int self = __lane_id();
  int index = self + lane_delta;
  index = (int)((self&(width-1))+lane_delta) >= width?self:index;
  return __builtin_amdgcn_ds_bpermute(index<<2, var);
}
__device__
inline
unsigned int __shfl_down(unsigned int var, unsigned int lane_delta, int width = warpSize) {
  union { int i; unsigned u; float f; } tmp; tmp.u = var;
  tmp.i = __shfl_down(tmp.i, lane_delta, width);
  return tmp.u;
}
__device__
inline
float __shfl_down(float var, unsigned int lane_delta, int width = warpSize) {
  union { int i; unsigned u; float f; } tmp; tmp.f = var;
  tmp.i = __shfl_down(tmp.i, lane_delta, width);
  return tmp.f;
}
__device__
inline
double __shfl_down(double var, unsigned int lane_delta, int width = warpSize) {
  static_assert(sizeof(double) == 2 * sizeof(int), "");
  static_assert(sizeof(double) == sizeof(uint64_t), "");

  int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
  tmp[0] = __shfl_down(tmp[0], lane_delta, width);
  tmp[1] = __shfl_down(tmp[1], lane_delta, width);

  uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
  double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
  return tmp1;
}
__device__
inline
long __shfl_down(long var, unsigned int lane_delta, int width = warpSize)
{
  #ifndef _MSC_VER
  static_assert(sizeof(long) == 2 * sizeof(int), "");
  static_assert(sizeof(long) == sizeof(uint64_t), "");

  int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
  tmp[0] = __shfl_down(tmp[0], lane_delta, width);
  tmp[1] = __shfl_down(tmp[1], lane_delta, width);

  uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
  long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
  return tmp1;
  #else
  static_assert(sizeof(long) == sizeof(int), "");
  return static_cast<long>(__shfl_down(static_cast<int>(var), lane_delta, width));
  #endif
}
__device__
inline
unsigned long __shfl_down(unsigned long var, unsigned int lane_delta, int width = warpSize)
{
  #ifndef _MSC_VER
  static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
  static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");

  unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
  tmp[0] = __shfl_down(tmp[0], lane_delta, width);
  tmp[1] = __shfl_down(tmp[1], lane_delta, width);

  uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
  unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
  return tmp1;
  #else
  static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
  return static_cast<unsigned long>(__shfl_down(static_cast<unsigned int>(var), lane_delta, width));
  #endif
}
__device__
inline
long long __shfl_down(long long var, unsigned int lane_delta, int width = warpSize)
{
  static_assert(sizeof(long long) == 2 * sizeof(int), "");
  static_assert(sizeof(long long) == sizeof(uint64_t), "");
  int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
  tmp[0] = __shfl_down(tmp[0], lane_delta, width);
  tmp[1] = __shfl_down(tmp[1], lane_delta, width);
  uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
  long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
  return tmp1;
}
__device__
inline
unsigned long long __shfl_down(unsigned long long var, unsigned int lane_delta, int width = warpSize)
{
  static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
  static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
  unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
  tmp[0] = __shfl_down(tmp[0], lane_delta, width);
  tmp[1] = __shfl_down(tmp[1], lane_delta, width);
  uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
  unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
  return tmp1;
}

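// __shfl_xor: each lane reads from the lane whose index differs from its own
// by `lane_mask` (a butterfly pattern), again clamped to the caller's
// width-wide segment.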
__device__
inline
int __shfl_xor(int var, int lane_mask, int width = warpSize) {
  int self = __lane_id();
  int index = self^lane_mask;
  index = index >= ((self+width)&~(width-1))?self:index;
  return __builtin_amdgcn_ds_bpermute(index<<2, var);
}
__device__
inline
unsigned int __shfl_xor(unsigned int var, int lane_mask, int width = warpSize) {
  union { int i; unsigned u; float f; } tmp; tmp.u = var;
  tmp.i = __shfl_xor(tmp.i, lane_mask, width);
  return tmp.u;
}
__device__
inline
float __shfl_xor(float var, int lane_mask, int width = warpSize) {
  union { int i; unsigned u; float f; } tmp; tmp.f = var;
  tmp.i = __shfl_xor(tmp.i, lane_mask, width);
  return tmp.f;
}
__device__
inline
double __shfl_xor(double var, int lane_mask, int width = warpSize) {
  static_assert(sizeof(double) == 2 * sizeof(int), "");
  static_assert(sizeof(double) == sizeof(uint64_t), "");

  int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
  tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
  tmp[1] = __shfl_xor(tmp[1], lane_mask, width);

  uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
  double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
  return tmp1;
}
__device__
inline
long __shfl_xor(long var, int lane_mask, int width = warpSize)
{
  #ifndef _MSC_VER
  static_assert(sizeof(long) == 2 * sizeof(int), "");
  static_assert(sizeof(long) == sizeof(uint64_t), "");

  int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
  tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
  tmp[1] = __shfl_xor(tmp[1], lane_mask, width);

  uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
  long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
  return tmp1;
  #else
  static_assert(sizeof(long) == sizeof(int), "");
  return static_cast<long>(__shfl_xor(static_cast<int>(var), lane_mask, width));
  #endif
}
__device__
inline
unsigned long __shfl_xor(unsigned long var, int lane_mask, int width = warpSize)
{
  #ifndef _MSC_VER
  static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
  static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");

  unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
  tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
  tmp[1] = __shfl_xor(tmp[1], lane_mask, width);

  uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
  unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
  return tmp1;
  #else
  static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
  return static_cast<unsigned long>(__shfl_xor(static_cast<unsigned int>(var), lane_mask, width));
  #endif
}
__device__
inline
long long __shfl_xor(long long var, int lane_mask, int width = warpSize)
{
  static_assert(sizeof(long long) == 2 * sizeof(int), "");
  static_assert(sizeof(long long) == sizeof(uint64_t), "");
  int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
  tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
  tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
  uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
  long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
  return tmp1;
}
__device__
inline
unsigned long long __shfl_xor(unsigned long long var, int lane_mask, int width = warpSize)
{
  static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
  static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
  unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
  tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
  tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
  uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
  unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
  return tmp1;
}

#endif
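
For illustration, a minimal usage sketch follows. It is not part of the header: the kernel and host names (wave_reduce_sum and the surrounding scaffolding) are invented for this example, and it assumes a standard HIP build in which hip_runtime.h makes these warp functions and the warpSize constant available. It sums one int per lane with __shfl_down and takes a wave-wide vote with __any.

// wave_reduce_sum.hip.cpp -- hypothetical example, not part of the header.
#include <hip/hip_runtime.h>
#include <cstdio>

__global__ void wave_reduce_sum(const int* in, int* out, int* any_negative) {
  int lane = threadIdx.x;            // one wavefront per block in this sketch
  int v = in[lane];

  // Wave-wide vote, evaluated by every lane before control flow diverges.
  int has_negative = __any(v < 0);

  // Classic shuffle-down tree reduction: lane 0 ends up with the full sum.
  for (int offset = warpSize / 2; offset > 0; offset /= 2)
    v += __shfl_down(v, offset);

  if (lane == 0) {
    *out = v;
    *any_negative = has_negative;
  }
}

int main() {
  hipDeviceProp_t prop;
  hipGetDeviceProperties(&prop, 0);  // wavefront size is 32 or 64

  int h_in[64];
  for (int i = 0; i < 64; ++i) h_in[i] = i;

  int *d_in, *d_out, *d_any;
  hipMalloc(&d_in, sizeof(h_in));
  hipMalloc(&d_out, sizeof(int));
  hipMalloc(&d_any, sizeof(int));
  hipMemcpy(d_in, h_in, sizeof(h_in), hipMemcpyHostToDevice);

  // Launch a single block containing exactly one wavefront.
  hipLaunchKernelGGL(wave_reduce_sum, dim3(1), dim3(prop.warpSize), 0, 0,
                     d_in, d_out, d_any);

  int h_out = 0, h_any = 0;
  hipMemcpy(&h_out, d_out, sizeof(int), hipMemcpyDeviceToHost);
  hipMemcpy(&h_any, d_any, sizeof(int), hipMemcpyDeviceToHost);
  printf("sum = %d, any_negative = %d\n", h_out, h_any);

  hipFree(d_in);
  hipFree(d_out);
  hipFree(d_any);
  return 0;
}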