HIP: Heterogenous-computing Interface for Portability
Loading...
Searching...
No Matches
amd_warp_functions.h
1/*
2Copyright (c) 2022 - 2023 Advanced Micro Devices, Inc. All rights reserved.
3
4Permission is hereby granted, free of charge, to any person obtaining a copy
5of this software and associated documentation files (the "Software"), to deal
6in the Software without restriction, including without limitation the rights
7to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8copies of the Software, and to permit persons to whom the Software is
9furnished to do so, subject to the following conditions:
10
11The above copyright notice and this permission notice shall be included in
12all copies or substantial portions of the Software.
13
14THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20THE SOFTWARE.
21*/
22
23#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_WARP_FUNCTIONS_H
24#define HIP_INCLUDE_HIP_AMD_DETAIL_WARP_FUNCTIONS_H
25
26__device__ static inline unsigned __hip_ds_bpermute(int index, unsigned src) {
27 union { int i; unsigned u; float f; } tmp; tmp.u = src;
28 tmp.i = __builtin_amdgcn_ds_bpermute(index, tmp.i);
29 return tmp.u;
30}
31
32__device__ static inline float __hip_ds_bpermutef(int index, float src) {
33 union { int i; unsigned u; float f; } tmp; tmp.f = src;
34 tmp.i = __builtin_amdgcn_ds_bpermute(index, tmp.i);
35 return tmp.f;
36}
37
38__device__ static inline unsigned __hip_ds_permute(int index, unsigned src) {
39 union { int i; unsigned u; float f; } tmp; tmp.u = src;
40 tmp.i = __builtin_amdgcn_ds_permute(index, tmp.i);
41 return tmp.u;
42}
43
44__device__ static inline float __hip_ds_permutef(int index, float src) {
45 union { int i; unsigned u; float f; } tmp; tmp.f = src;
46 tmp.i = __builtin_amdgcn_ds_permute(index, tmp.i);
47 return tmp.f;
48}
49
50#define __hip_ds_swizzle(src, pattern) __hip_ds_swizzle_N<(pattern)>((src))
51#define __hip_ds_swizzlef(src, pattern) __hip_ds_swizzlef_N<(pattern)>((src))
52
53template <int pattern>
54__device__ static inline unsigned __hip_ds_swizzle_N(unsigned int src) {
55 union { int i; unsigned u; float f; } tmp; tmp.u = src;
56 tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
57 return tmp.u;
58}
59
60template <int pattern>
61__device__ static inline float __hip_ds_swizzlef_N(float src) {
62 union { int i; unsigned u; float f; } tmp; tmp.f = src;
63 tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
64 return tmp.f;
65}
66
67#define __hip_move_dpp(src, dpp_ctrl, row_mask, bank_mask, bound_ctrl) \
68 __hip_move_dpp_N<(dpp_ctrl), (row_mask), (bank_mask), (bound_ctrl)>((src))
69
70template <int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl>
71__device__ static inline int __hip_move_dpp_N(int src) {
72 return __builtin_amdgcn_mov_dpp(src, dpp_ctrl, row_mask, bank_mask,
73 bound_ctrl);
74}
75
76__device__
77static constexpr int warpSize = __AMDGCN_WAVEFRONT_SIZE;
78
79// warp vote function __all __any __ballot
80__device__
81inline
82int __all(int predicate) {
83 return __ockl_wfall_i32(predicate);
84}
85
86__device__
87inline
88int __any(int predicate) {
89 return __ockl_wfany_i32(predicate);
90}
91
92// XXX from llvm/include/llvm/IR/InstrTypes.h
93#define ICMP_NE 33
94
95__device__
96inline
97unsigned long long int __ballot(int predicate) {
98 return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
99}
100
101__device__
102inline
103unsigned long long int __ballot64(int predicate) {
104 return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
105}
106
107// See amd_warp_sync_functions.h for an explanation of this preprocessor flag.
108#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS
109// Since threads in a wave do not make independent progress, __activemask()
110// always returns the exact active mask, i.e, all active threads in the wave.
111__device__
112inline
113unsigned long long __activemask() {
114 return __ballot(true);
115}
116#endif // HIP_ENABLE_WARP_SYNC_BUILTINS
117
118__device__ static inline unsigned int __lane_id() {
119 return __builtin_amdgcn_mbcnt_hi(
120 -1, __builtin_amdgcn_mbcnt_lo(-1, 0));
121}
122
123__device__
124inline
125int __shfl(int var, int src_lane, int width = warpSize) {
126 int self = __lane_id();
127 int index = (src_lane & (width - 1)) + (self & ~(width-1));
128 return __builtin_amdgcn_ds_bpermute(index<<2, var);
129}
130__device__
131inline
132unsigned int __shfl(unsigned int var, int src_lane, int width = warpSize) {
133 union { int i; unsigned u; float f; } tmp; tmp.u = var;
134 tmp.i = __shfl(tmp.i, src_lane, width);
135 return tmp.u;
136}
137__device__
138inline
139float __shfl(float var, int src_lane, int width = warpSize) {
140 union { int i; unsigned u; float f; } tmp; tmp.f = var;
141 tmp.i = __shfl(tmp.i, src_lane, width);
142 return tmp.f;
143}
144__device__
145inline
146double __shfl(double var, int src_lane, int width = warpSize) {
147 static_assert(sizeof(double) == 2 * sizeof(int), "");
148 static_assert(sizeof(double) == sizeof(uint64_t), "");
149
150 int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
151 tmp[0] = __shfl(tmp[0], src_lane, width);
152 tmp[1] = __shfl(tmp[1], src_lane, width);
153
154 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
155 double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
156 return tmp1;
157}
158__device__
159inline
160long __shfl(long var, int src_lane, int width = warpSize)
161{
162 #ifndef _MSC_VER
163 static_assert(sizeof(long) == 2 * sizeof(int), "");
164 static_assert(sizeof(long) == sizeof(uint64_t), "");
165
166 int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
167 tmp[0] = __shfl(tmp[0], src_lane, width);
168 tmp[1] = __shfl(tmp[1], src_lane, width);
169
170 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
171 long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
172 return tmp1;
173 #else
174 static_assert(sizeof(long) == sizeof(int), "");
175 return static_cast<long>(__shfl(static_cast<int>(var), src_lane, width));
176 #endif
177}
178__device__
179inline
180unsigned long __shfl(unsigned long var, int src_lane, int width = warpSize) {
181 #ifndef _MSC_VER
182 static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
183 static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
184
185 unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
186 tmp[0] = __shfl(tmp[0], src_lane, width);
187 tmp[1] = __shfl(tmp[1], src_lane, width);
188
189 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
190 unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
191 return tmp1;
192 #else
193 static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
194 return static_cast<unsigned long>(__shfl(static_cast<unsigned int>(var), src_lane, width));
195 #endif
196}
197__device__
198inline
199long long __shfl(long long var, int src_lane, int width = warpSize)
200{
201 static_assert(sizeof(long long) == 2 * sizeof(int), "");
202 static_assert(sizeof(long long) == sizeof(uint64_t), "");
203
204 int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
205 tmp[0] = __shfl(tmp[0], src_lane, width);
206 tmp[1] = __shfl(tmp[1], src_lane, width);
207
208 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
209 long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
210 return tmp1;
211}
212__device__
213inline
214unsigned long long __shfl(unsigned long long var, int src_lane, int width = warpSize) {
215 static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
216 static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
217
218 unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
219 tmp[0] = __shfl(tmp[0], src_lane, width);
220 tmp[1] = __shfl(tmp[1], src_lane, width);
221
222 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
223 unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
224 return tmp1;
225}
226
227__device__
228inline
229int __shfl_up(int var, unsigned int lane_delta, int width = warpSize) {
230 int self = __lane_id();
231 int index = self - lane_delta;
232 index = (index < (self & ~(width-1)))?self:index;
233 return __builtin_amdgcn_ds_bpermute(index<<2, var);
234}
235__device__
236inline
237unsigned int __shfl_up(unsigned int var, unsigned int lane_delta, int width = warpSize) {
238 union { int i; unsigned u; float f; } tmp; tmp.u = var;
239 tmp.i = __shfl_up(tmp.i, lane_delta, width);
240 return tmp.u;
241}
242__device__
243inline
244float __shfl_up(float var, unsigned int lane_delta, int width = warpSize) {
245 union { int i; unsigned u; float f; } tmp; tmp.f = var;
246 tmp.i = __shfl_up(tmp.i, lane_delta, width);
247 return tmp.f;
248}
249__device__
250inline
251double __shfl_up(double var, unsigned int lane_delta, int width = warpSize) {
252 static_assert(sizeof(double) == 2 * sizeof(int), "");
253 static_assert(sizeof(double) == sizeof(uint64_t), "");
254
255 int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
256 tmp[0] = __shfl_up(tmp[0], lane_delta, width);
257 tmp[1] = __shfl_up(tmp[1], lane_delta, width);
258
259 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
260 double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
261 return tmp1;
262}
263__device__
264inline
265long __shfl_up(long var, unsigned int lane_delta, int width = warpSize)
266{
267 #ifndef _MSC_VER
268 static_assert(sizeof(long) == 2 * sizeof(int), "");
269 static_assert(sizeof(long) == sizeof(uint64_t), "");
270
271 int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
272 tmp[0] = __shfl_up(tmp[0], lane_delta, width);
273 tmp[1] = __shfl_up(tmp[1], lane_delta, width);
274
275 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
276 long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
277 return tmp1;
278 #else
279 static_assert(sizeof(long) == sizeof(int), "");
280 return static_cast<long>(__shfl_up(static_cast<int>(var), lane_delta, width));
281 #endif
282}
283
284__device__
285inline
286unsigned long __shfl_up(unsigned long var, unsigned int lane_delta, int width = warpSize)
287{
288 #ifndef _MSC_VER
289 static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
290 static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
291
292 unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
293 tmp[0] = __shfl_up(tmp[0], lane_delta, width);
294 tmp[1] = __shfl_up(tmp[1], lane_delta, width);
295
296 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
297 unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
298 return tmp1;
299 #else
300 static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
301 return static_cast<unsigned long>(__shfl_up(static_cast<unsigned int>(var), lane_delta, width));
302 #endif
303}
304
305__device__
306inline
307long long __shfl_up(long long var, unsigned int lane_delta, int width = warpSize)
308{
309 static_assert(sizeof(long long) == 2 * sizeof(int), "");
310 static_assert(sizeof(long long) == sizeof(uint64_t), "");
311 int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
312 tmp[0] = __shfl_up(tmp[0], lane_delta, width);
313 tmp[1] = __shfl_up(tmp[1], lane_delta, width);
314 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
315 long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
316 return tmp1;
317}
318
319__device__
320inline
321unsigned long long __shfl_up(unsigned long long var, unsigned int lane_delta, int width = warpSize)
322{
323 static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
324 static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
325 unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
326 tmp[0] = __shfl_up(tmp[0], lane_delta, width);
327 tmp[1] = __shfl_up(tmp[1], lane_delta, width);
328 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
329 unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
330 return tmp1;
331}
332
333__device__
334inline
335int __shfl_down(int var, unsigned int lane_delta, int width = warpSize) {
336 int self = __lane_id();
337 int index = self + lane_delta;
338 index = (int)((self&(width-1))+lane_delta) >= width?self:index;
339 return __builtin_amdgcn_ds_bpermute(index<<2, var);
340}
341__device__
342inline
343unsigned int __shfl_down(unsigned int var, unsigned int lane_delta, int width = warpSize) {
344 union { int i; unsigned u; float f; } tmp; tmp.u = var;
345 tmp.i = __shfl_down(tmp.i, lane_delta, width);
346 return tmp.u;
347}
348__device__
349inline
350float __shfl_down(float var, unsigned int lane_delta, int width = warpSize) {
351 union { int i; unsigned u; float f; } tmp; tmp.f = var;
352 tmp.i = __shfl_down(tmp.i, lane_delta, width);
353 return tmp.f;
354}
355__device__
356inline
357double __shfl_down(double var, unsigned int lane_delta, int width = warpSize) {
358 static_assert(sizeof(double) == 2 * sizeof(int), "");
359 static_assert(sizeof(double) == sizeof(uint64_t), "");
360
361 int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
362 tmp[0] = __shfl_down(tmp[0], lane_delta, width);
363 tmp[1] = __shfl_down(tmp[1], lane_delta, width);
364
365 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
366 double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
367 return tmp1;
368}
369__device__
370inline
371long __shfl_down(long var, unsigned int lane_delta, int width = warpSize)
372{
373 #ifndef _MSC_VER
374 static_assert(sizeof(long) == 2 * sizeof(int), "");
375 static_assert(sizeof(long) == sizeof(uint64_t), "");
376
377 int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
378 tmp[0] = __shfl_down(tmp[0], lane_delta, width);
379 tmp[1] = __shfl_down(tmp[1], lane_delta, width);
380
381 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
382 long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
383 return tmp1;
384 #else
385 static_assert(sizeof(long) == sizeof(int), "");
386 return static_cast<long>(__shfl_down(static_cast<int>(var), lane_delta, width));
387 #endif
388}
389__device__
390inline
391unsigned long __shfl_down(unsigned long var, unsigned int lane_delta, int width = warpSize)
392{
393 #ifndef _MSC_VER
394 static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
395 static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
396
397 unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
398 tmp[0] = __shfl_down(tmp[0], lane_delta, width);
399 tmp[1] = __shfl_down(tmp[1], lane_delta, width);
400
401 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
402 unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
403 return tmp1;
404 #else
405 static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
406 return static_cast<unsigned long>(__shfl_down(static_cast<unsigned int>(var), lane_delta, width));
407 #endif
408}
409__device__
410inline
411long long __shfl_down(long long var, unsigned int lane_delta, int width = warpSize)
412{
413 static_assert(sizeof(long long) == 2 * sizeof(int), "");
414 static_assert(sizeof(long long) == sizeof(uint64_t), "");
415 int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
416 tmp[0] = __shfl_down(tmp[0], lane_delta, width);
417 tmp[1] = __shfl_down(tmp[1], lane_delta, width);
418 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
419 long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
420 return tmp1;
421}
422__device__
423inline
424unsigned long long __shfl_down(unsigned long long var, unsigned int lane_delta, int width = warpSize)
425{
426 static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
427 static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
428 unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
429 tmp[0] = __shfl_down(tmp[0], lane_delta, width);
430 tmp[1] = __shfl_down(tmp[1], lane_delta, width);
431 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
432 unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
433 return tmp1;
434}
435
436__device__
437inline
438int __shfl_xor(int var, int lane_mask, int width = warpSize) {
439 int self = __lane_id();
440 int index = self^lane_mask;
441 index = index >= ((self+width)&~(width-1))?self:index;
442 return __builtin_amdgcn_ds_bpermute(index<<2, var);
443}
444__device__
445inline
446unsigned int __shfl_xor(unsigned int var, int lane_mask, int width = warpSize) {
447 union { int i; unsigned u; float f; } tmp; tmp.u = var;
448 tmp.i = __shfl_xor(tmp.i, lane_mask, width);
449 return tmp.u;
450}
451__device__
452inline
453float __shfl_xor(float var, int lane_mask, int width = warpSize) {
454 union { int i; unsigned u; float f; } tmp; tmp.f = var;
455 tmp.i = __shfl_xor(tmp.i, lane_mask, width);
456 return tmp.f;
457}
458__device__
459inline
460double __shfl_xor(double var, int lane_mask, int width = warpSize) {
461 static_assert(sizeof(double) == 2 * sizeof(int), "");
462 static_assert(sizeof(double) == sizeof(uint64_t), "");
463
464 int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
465 tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
466 tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
467
468 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
469 double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
470 return tmp1;
471}
472__device__
473inline
474long __shfl_xor(long var, int lane_mask, int width = warpSize)
475{
476 #ifndef _MSC_VER
477 static_assert(sizeof(long) == 2 * sizeof(int), "");
478 static_assert(sizeof(long) == sizeof(uint64_t), "");
479
480 int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
481 tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
482 tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
483
484 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
485 long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
486 return tmp1;
487 #else
488 static_assert(sizeof(long) == sizeof(int), "");
489 return static_cast<long>(__shfl_xor(static_cast<int>(var), lane_mask, width));
490 #endif
491}
492__device__
493inline
494unsigned long __shfl_xor(unsigned long var, int lane_mask, int width = warpSize)
495{
496 #ifndef _MSC_VER
497 static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
498 static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
499
500 unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
501 tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
502 tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
503
504 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
505 unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
506 return tmp1;
507 #else
508 static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
509 return static_cast<unsigned long>(__shfl_xor(static_cast<unsigned int>(var), lane_mask, width));
510 #endif
511}
512__device__
513inline
514long long __shfl_xor(long long var, int lane_mask, int width = warpSize)
515{
516 static_assert(sizeof(long long) == 2 * sizeof(int), "");
517 static_assert(sizeof(long long) == sizeof(uint64_t), "");
518 int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
519 tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
520 tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
521 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
522 long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
523 return tmp1;
524}
525__device__
526inline
527unsigned long long __shfl_xor(unsigned long long var, int lane_mask, int width = warpSize)
528{
529 static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
530 static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
531 unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
532 tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
533 tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
534 uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
535 unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
536 return tmp1;
537}
538
539#endif