Point Cloud Library (PCL) 1.12.0
Loading...
Searching...
No Matches
utils.hpp
1/*
2 * Software License Agreement (BSD License)
3 *
4 * Point Cloud Library (PCL) - www.pointclouds.org
5 * Copyright (c) 2011, Willow Garage, Inc.
6 *
7 * All rights reserved.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 *
13 * * Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * * Redistributions in binary form must reproduce the above
16 * copyright notice, this list of conditions and the following
17 * disclaimer in the documentation and/or other materials provided
18 * with the distribution.
19 * * Neither the name of Willow Garage, Inc. nor the names of its
20 * contributors may be used to endorse or promote products derived
21 * from this software without specific prior written permission.
22 *
23 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
26 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
27 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
28 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
29 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
31 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
33 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
34 * POSSIBILITY OF SUCH DAMAGE.
35 *
36 */
37
38
39#ifndef PCL_GPU_KINFU_CUDA_UTILS_HPP_
40#define PCL_GPU_KINFU_CUDA_UTILS_HPP_
41
42#include <pcl/common/utils.h> // pcl::utils::ignore
43
44#include <limits>
45
46#include <cuda.h>
47
48namespace pcl
49{
50 namespace device
51 {
52 template <class T>
54 {
55 T c(a); a=b; b=c;
56 }
57
59 dot(const float3& v1, const float3& v2)
60 {
61 return v1.x * v2.x + v1.y*v2.y + v1.z*v2.z;
62 }
63
65 operator+=(float3& vec, const float& v)
66 {
67 vec.x += v; vec.y += v; vec.z += v; return vec;
68 }
69
71 operator+(const float3& v1, const float3& v2)
72 {
73 return make_float3(v1.x + v2.x, v1.y + v2.y, v1.z + v2.z);
74 }
75
77 operator*=(float3& vec, const float& v)
78 {
79 vec.x *= v; vec.y *= v; vec.z *= v; return vec;
80 }
81
83 operator-(const float3& v1, const float3& v2)
84 {
85 return make_float3(v1.x - v2.x, v1.y - v2.y, v1.z - v2.z);
86 }
87
89 operator*(const float3& v1, const float& v)
90 {
91 return make_float3(v1.x * v, v1.y * v, v1.z * v);
92 }
93
95 norm(const float3& v)
96 {
97 return sqrt(dot(v, v));
98 }
99
102 {
103 return v * rsqrt(dot(v, v));
104 }
105
107 cross(const float3& v1, const float3& v2)
108 {
109 return make_float3(v1.y * v2.z - v1.z * v2.y, v1.z * v2.x - v1.x * v2.z, v1.x * v2.y - v1.y * v2.x);
110 }
111
112 __device__ __forceinline__ void computeRoots2(const float& b, const float& c, float3& roots)
113 {
114 roots.x = 0.f;
115 float d = b * b - 4.f * c;
116 if (d < 0.f) // no real roots!!!! THIS SHOULD NOT HAPPEN!
117 d = 0.f;
118
119 float sd = sqrtf(d);
120
121 roots.z = 0.5f * (b + sd);
122 roots.y = 0.5f * (b - sd);
123 }
124
125 __device__ __forceinline__ void
126 computeRoots3(float c0, float c1, float c2, float3& roots)
127 {
128 if ( std::abs(c0) < std::numeric_limits<float>::epsilon())// one root is 0 -> quadratic equation
129 {
130 computeRoots2 (c2, c1, roots);
131 }
132 else
133 {
134 const float s_inv3 = 1.f/3.f;
135 const float s_sqrt3 = sqrtf(3.f);
136 // Construct the parameters used in classifying the roots of the equation
137 // and in solving the equation for the roots in closed form.
138 float c2_over_3 = c2 * s_inv3;
139 float a_over_3 = (c1 - c2*c2_over_3)*s_inv3;
140 if (a_over_3 > 0.f)
141 a_over_3 = 0.f;
142
143 float half_b = 0.5f * (c0 + c2_over_3 * (2.f * c2_over_3 * c2_over_3 - c1));
144
145 float q = half_b * half_b + a_over_3 * a_over_3 * a_over_3;
146 if (q > 0.f)
147 q = 0.f;
148
149 // Compute the eigenvalues by solving for the roots of the polynomial.
150 float rho = sqrtf(-a_over_3);
151 float theta = std::atan2 (sqrtf (-q), half_b)*s_inv3;
152 float cos_theta = __cosf (theta);
153 float sin_theta = __sinf (theta);
154 roots.x = c2_over_3 + 2.f * rho * cos_theta;
155 roots.y = c2_over_3 - rho * (cos_theta + s_sqrt3 * sin_theta);
156 roots.z = c2_over_3 - rho * (cos_theta - s_sqrt3 * sin_theta);
157
158 // Sort in increasing order.
159 if (roots.x >= roots.y)
160 swap(roots.x, roots.y);
161
162 if (roots.y >= roots.z)
163 {
164 swap(roots.y, roots.z);
165
166 if (roots.x >= roots.y)
167 swap (roots.x, roots.y);
168 }
169 if (roots.x <= 0) // eigenval for symmetric positive semi-definite matrix can not be negative! Set it to 0
170 computeRoots2 (c2, c1, roots);
171 }
172 }
173
174 struct Eigen33
175 {
176 public:
177 template<int Rows>
178 struct MiniMat
179 {
180 float3 data[Rows];
182 __device__ __host__ __forceinline__ const float3& operator[](int i) const { return data[i]; }
183 };
186
187
190 {
191 float3 perp;
192 /* Let us compute the crossed product of *this with a vector
193 * that is not too close to being colinear to *this.
194 */
195
196 /* unless the x and y coords are both close to zero, we can
197 * simply take ( -y, x, 0 ) and normalize it.
198 */
199 if(!isMuchSmallerThan(src.x, src.z) || !isMuchSmallerThan(src.y, src.z))
200 {
201 float invnm = rsqrtf(src.x*src.x + src.y*src.y);
202 perp.x = -src.y * invnm;
203 perp.y = src.x * invnm;
204 perp.z = 0.0f;
205 }
206 /* if both x and y are close to zero, then the vector is close
207 * to the z-axis, so it's far from colinear to the x-axis for instance.
208 * So we take the crossed product with (1,0,0) and normalize it.
209 */
210 else
211 {
212 float invnm = rsqrtf(src.z * src.z + src.y * src.y);
213 perp.x = 0.0f;
214 perp.y = -src.z * invnm;
215 perp.z = src.y * invnm;
216 }
217
218 return perp;
219 }
220
222 Eigen33(volatile float* mat_pkg_arg) : mat_pkg(mat_pkg_arg) {}
225 {
226 // Scale the matrix so its entries are in [-1,1]. The scaling is applied
227 // only when at least one matrix entry has magnitude larger than 1.
228
229 float max01 = fmaxf( std::abs(mat_pkg[0]), std::abs(mat_pkg[1]) );
230 float max23 = fmaxf( std::abs(mat_pkg[2]), std::abs(mat_pkg[3]) );
231 float max45 = fmaxf( std::abs(mat_pkg[4]), std::abs(mat_pkg[5]) );
232 float m0123 = fmaxf( max01, max23);
233 float scale = fmaxf( max45, m0123);
234
235 if (scale <= std::numeric_limits<float>::min())
236 scale = 1.f;
237
238 mat_pkg[0] /= scale;
239 mat_pkg[1] /= scale;
240 mat_pkg[2] /= scale;
241 mat_pkg[3] /= scale;
242 mat_pkg[4] /= scale;
243 mat_pkg[5] /= scale;
244
245 // The characteristic equation is x^3 - c2*x^2 + c1*x - c0 = 0. The
246 // eigenvalues are the roots to this equation, all guaranteed to be
247 // real-valued, because the matrix is symmetric.
248 float c0 = m00() * m11() * m22()
249 + 2.f * m01() * m02() * m12()
250 - m00() * m12() * m12()
251 - m11() * m02() * m02()
252 - m22() * m01() * m01();
253 float c1 = m00() * m11() -
254 m01() * m01() +
255 m00() * m22() -
256 m02() * m02() +
257 m11() * m22() -
258 m12() * m12();
259 float c2 = m00() + m11() + m22();
260
261 computeRoots3(c0, c1, c2, evals);
262
263 if(evals.z - evals.x <= std::numeric_limits<float>::epsilon())
264 {
265 evecs[0] = make_float3(1.f, 0.f, 0.f);
266 evecs[1] = make_float3(0.f, 1.f, 0.f);
267 evecs[2] = make_float3(0.f, 0.f, 1.f);
268 }
269 else if (evals.y - evals.x <= std::numeric_limits<float>::epsilon() )
270 {
271 // first and second equal
272 tmp[0] = row0(); tmp[1] = row1(); tmp[2] = row2();
273 tmp[0].x -= evals.z; tmp[1].y -= evals.z; tmp[2].z -= evals.z;
274
275 vec_tmp[0] = cross(tmp[0], tmp[1]);
276 vec_tmp[1] = cross(tmp[0], tmp[2]);
277 vec_tmp[2] = cross(tmp[1], tmp[2]);
278
279 float len1 = dot (vec_tmp[0], vec_tmp[0]);
280 float len2 = dot (vec_tmp[1], vec_tmp[1]);
281 float len3 = dot (vec_tmp[2], vec_tmp[2]);
282
283 if (len1 >= len2 && len1 >= len3)
284 {
285 evecs[2] = vec_tmp[0] * rsqrtf (len1);
286 }
287 else if (len2 >= len1 && len2 >= len3)
288 {
289 evecs[2] = vec_tmp[1] * rsqrtf (len2);
290 }
291 else
292 {
293 evecs[2] = vec_tmp[2] * rsqrtf (len3);
294 }
295
296 evecs[1] = unitOrthogonal(evecs[2]);
297 evecs[0] = cross(evecs[1], evecs[2]);
298 }
299 else if (evals.z - evals.y <= std::numeric_limits<float>::epsilon() )
300 {
301 // second and third equal
302 tmp[0] = row0(); tmp[1] = row1(); tmp[2] = row2();
303 tmp[0].x -= evals.x; tmp[1].y -= evals.x; tmp[2].z -= evals.x;
304
305 vec_tmp[0] = cross(tmp[0], tmp[1]);
306 vec_tmp[1] = cross(tmp[0], tmp[2]);
307 vec_tmp[2] = cross(tmp[1], tmp[2]);
308
309 float len1 = dot(vec_tmp[0], vec_tmp[0]);
310 float len2 = dot(vec_tmp[1], vec_tmp[1]);
311 float len3 = dot(vec_tmp[2], vec_tmp[2]);
312
313 if (len1 >= len2 && len1 >= len3)
314 {
315 evecs[0] = vec_tmp[0] * rsqrtf(len1);
316 }
317 else if (len2 >= len1 && len2 >= len3)
318 {
319 evecs[0] = vec_tmp[1] * rsqrtf(len2);
320 }
321 else
322 {
323 evecs[0] = vec_tmp[2] * rsqrtf(len3);
324 }
325
326 evecs[1] = unitOrthogonal( evecs[0] );
327 evecs[2] = cross(evecs[0], evecs[1]);
328 }
329 else
330 {
331
332 tmp[0] = row0(); tmp[1] = row1(); tmp[2] = row2();
333 tmp[0].x -= evals.z; tmp[1].y -= evals.z; tmp[2].z -= evals.z;
334
335 vec_tmp[0] = cross(tmp[0], tmp[1]);
336 vec_tmp[1] = cross(tmp[0], tmp[2]);
337 vec_tmp[2] = cross(tmp[1], tmp[2]);
338
339 float len1 = dot(vec_tmp[0], vec_tmp[0]);
340 float len2 = dot(vec_tmp[1], vec_tmp[1]);
341 float len3 = dot(vec_tmp[2], vec_tmp[2]);
342
343 float mmax[3];
344
345 unsigned int min_el = 2;
346 unsigned int max_el = 2;
347 if (len1 >= len2 && len1 >= len3)
348 {
349 mmax[2] = len1;
350 evecs[2] = vec_tmp[0] * rsqrtf (len1);
351 }
352 else if (len2 >= len1 && len2 >= len3)
353 {
354 mmax[2] = len2;
355 evecs[2] = vec_tmp[1] * rsqrtf (len2);
356 }
357 else
358 {
359 mmax[2] = len3;
360 evecs[2] = vec_tmp[2] * rsqrtf (len3);
361 }
362
363 tmp[0] = row0(); tmp[1] = row1(); tmp[2] = row2();
364 tmp[0].x -= evals.y; tmp[1].y -= evals.y; tmp[2].z -= evals.y;
365
366 vec_tmp[0] = cross(tmp[0], tmp[1]);
367 vec_tmp[1] = cross(tmp[0], tmp[2]);
368 vec_tmp[2] = cross(tmp[1], tmp[2]);
369
370 len1 = dot(vec_tmp[0], vec_tmp[0]);
371 len2 = dot(vec_tmp[1], vec_tmp[1]);
372 len3 = dot(vec_tmp[2], vec_tmp[2]);
373
374 if (len1 >= len2 && len1 >= len3)
375 {
376 mmax[1] = len1;
377 evecs[1] = vec_tmp[0] * rsqrtf (len1);
378 min_el = len1 <= mmax[min_el] ? 1 : min_el;
379 max_el = len1 > mmax[max_el] ? 1 : max_el;
380 }
381 else if (len2 >= len1 && len2 >= len3)
382 {
383 mmax[1] = len2;
384 evecs[1] = vec_tmp[1] * rsqrtf (len2);
385 min_el = len2 <= mmax[min_el] ? 1 : min_el;
386 max_el = len2 > mmax[max_el] ? 1 : max_el;
387 }
388 else
389 {
390 mmax[1] = len3;
391 evecs[1] = vec_tmp[2] * rsqrtf (len3);
392 min_el = len3 <= mmax[min_el] ? 1 : min_el;
393 max_el = len3 > mmax[max_el] ? 1 : max_el;
394 }
395
396 tmp[0] = row0(); tmp[1] = row1(); tmp[2] = row2();
397 tmp[0].x -= evals.x; tmp[1].y -= evals.x; tmp[2].z -= evals.x;
398
399 vec_tmp[0] = cross(tmp[0], tmp[1]);
400 vec_tmp[1] = cross(tmp[0], tmp[2]);
401 vec_tmp[2] = cross(tmp[1], tmp[2]);
402
403 len1 = dot (vec_tmp[0], vec_tmp[0]);
404 len2 = dot (vec_tmp[1], vec_tmp[1]);
405 len3 = dot (vec_tmp[2], vec_tmp[2]);
406
407
408 if (len1 >= len2 && len1 >= len3)
409 {
410 mmax[0] = len1;
411 evecs[0] = vec_tmp[0] * rsqrtf (len1);
412 min_el = len3 <= mmax[min_el] ? 0 : min_el;
413 max_el = len3 > mmax[max_el] ? 0 : max_el;
414 }
415 else if (len2 >= len1 && len2 >= len3)
416 {
417 mmax[0] = len2;
418 evecs[0] = vec_tmp[1] * rsqrtf (len2);
419 min_el = len3 <= mmax[min_el] ? 0 : min_el;
420 max_el = len3 > mmax[max_el] ? 0 : max_el;
421 }
422 else
423 {
424 mmax[0] = len3;
425 evecs[0] = vec_tmp[2] * rsqrtf (len3);
426 min_el = len3 <= mmax[min_el] ? 0 : min_el;
427 max_el = len3 > mmax[max_el] ? 0 : max_el;
428 }
429
430 unsigned mid_el = 3 - min_el - max_el;
431 evecs[min_el] = normalized( cross( evecs[(min_el+1) % 3], evecs[(min_el+2) % 3] ) );
432 evecs[mid_el] = normalized( cross( evecs[(mid_el+1) % 3], evecs[(mid_el+2) % 3] ) );
433 }
434 // Rescale back to the original size.
435 evals *= scale;
436 }
437 private:
438 volatile float* mat_pkg;
439
440 __device__ __forceinline__ float m00() const { return mat_pkg[0]; }
441 __device__ __forceinline__ float m01() const { return mat_pkg[1]; }
442 __device__ __forceinline__ float m02() const { return mat_pkg[2]; }
443 __device__ __forceinline__ float m10() const { return mat_pkg[1]; }
444 __device__ __forceinline__ float m11() const { return mat_pkg[3]; }
445 __device__ __forceinline__ float m12() const { return mat_pkg[4]; }
446 __device__ __forceinline__ float m20() const { return mat_pkg[2]; }
447 __device__ __forceinline__ float m21() const { return mat_pkg[4]; }
448 __device__ __forceinline__ float m22() const { return mat_pkg[5]; }
449
450 __device__ __forceinline__ float3 row0() const { return make_float3( m00(), m01(), m02() ); }
451 __device__ __forceinline__ float3 row1() const { return make_float3( m10(), m11(), m12() ); }
452 __device__ __forceinline__ float3 row2() const { return make_float3( m20(), m21(), m22() ); }
453
454 __device__ __forceinline__ static bool isMuchSmallerThan (float x, float y)
455 {
456 // copied from <eigen>/include/Eigen/src/Core/NumTraits.h
457 const float prec_sqr = std::numeric_limits<float>::epsilon() * std::numeric_limits<float>::epsilon();
458 return x * x <= prec_sqr * y * y;
459 }
460 };
461
462 struct Block
463 {
464 static __device__ __forceinline__ unsigned int stride()
465 {
466 return blockDim.x * blockDim.y * blockDim.z;
467 }
468
469 static __device__ __forceinline__ int
471 {
472 return threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
473 }
474
475 template<int CTA_SIZE, typename T, class BinOp>
476 static __device__ __forceinline__ void reduce(volatile T* buffer, BinOp op)
477 {
478 int tid = flattenedThreadId();
479 T val = buffer[tid];
480
481 if (CTA_SIZE >= 1024) { if (tid < 512) buffer[tid] = val = op(val, buffer[tid + 512]); __syncthreads(); }
482 if (CTA_SIZE >= 512) { if (tid < 256) buffer[tid] = val = op(val, buffer[tid + 256]); __syncthreads(); }
483 if (CTA_SIZE >= 256) { if (tid < 128) buffer[tid] = val = op(val, buffer[tid + 128]); __syncthreads(); }
484 if (CTA_SIZE >= 128) { if (tid < 64) buffer[tid] = val = op(val, buffer[tid + 64]); __syncthreads(); }
485
486 if (tid < 32)
487 {
488 if (CTA_SIZE >= 64) { buffer[tid] = val = op(val, buffer[tid + 32]); }
489 if (CTA_SIZE >= 32) { buffer[tid] = val = op(val, buffer[tid + 16]); }
490 if (CTA_SIZE >= 16) { buffer[tid] = val = op(val, buffer[tid + 8]); }
491 if (CTA_SIZE >= 8) { buffer[tid] = val = op(val, buffer[tid + 4]); }
492 if (CTA_SIZE >= 4) { buffer[tid] = val = op(val, buffer[tid + 2]); }
493 if (CTA_SIZE >= 2) { buffer[tid] = val = op(val, buffer[tid + 1]); }
494 }
495 }
496
497 template<int CTA_SIZE, typename T, class BinOp>
498 static __device__ __forceinline__ T reduce(volatile T* buffer, T init, BinOp op)
499 {
500 int tid = flattenedThreadId();
501 T val = buffer[tid] = init;
503
504 if (CTA_SIZE >= 1024) { if (tid < 512) buffer[tid] = val = op(val, buffer[tid + 512]); __syncthreads(); }
505 if (CTA_SIZE >= 512) { if (tid < 256) buffer[tid] = val = op(val, buffer[tid + 256]); __syncthreads(); }
506 if (CTA_SIZE >= 256) { if (tid < 128) buffer[tid] = val = op(val, buffer[tid + 128]); __syncthreads(); }
507 if (CTA_SIZE >= 128) { if (tid < 64) buffer[tid] = val = op(val, buffer[tid + 64]); __syncthreads(); }
508
509 if (tid < 32)
510 {
511 if (CTA_SIZE >= 64) { buffer[tid] = val = op(val, buffer[tid + 32]); }
512 if (CTA_SIZE >= 32) { buffer[tid] = val = op(val, buffer[tid + 16]); }
513 if (CTA_SIZE >= 16) { buffer[tid] = val = op(val, buffer[tid + 8]); }
514 if (CTA_SIZE >= 8) { buffer[tid] = val = op(val, buffer[tid + 4]); }
515 if (CTA_SIZE >= 4) { buffer[tid] = val = op(val, buffer[tid + 2]); }
516 if (CTA_SIZE >= 2) { buffer[tid] = val = op(val, buffer[tid + 1]); }
517 }
518 __syncthreads();
519 return buffer[0];
520 }
521 };
522
523 struct Warp
524 {
525 enum
526 {
530 };
531
532 /** \brief Returns the warp lane ID of the calling thread. */
533 static __device__ __forceinline__ unsigned int
535 {
536 unsigned int ret;
537 asm("mov.u32 %0, %laneid;" : "=r"(ret) );
538 return ret;
539 }
540
541 static __device__ __forceinline__ unsigned int id()
542 {
543 int tid = threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
544 return tid >> LOG_WARP_SIZE;
545 }
546
549 {
550 unsigned int ret;
551 asm("mov.u32 %0, %lanemask_lt;" : "=r"(ret) );
552 return ret;
553 }
554
559 };
560
561
563 {
564 static __device__ __forceinline__ int
565 warp_reduce ( volatile int *ptr , const unsigned int tid)
566 {
567 const unsigned int lane = tid & 31; // index of thread in warp (0..31)
568
569 if (lane < 16)
570 {
571 int partial = ptr[tid];
572
573 ptr[tid] = partial = partial + ptr[tid + 16];
574 ptr[tid] = partial = partial + ptr[tid + 8];
575 ptr[tid] = partial = partial + ptr[tid + 4];
576 ptr[tid] = partial = partial + ptr[tid + 2];
577 ptr[tid] = partial = partial + ptr[tid + 1];
578 }
579 return ptr[tid - lane];
580 }
581
582 static __forceinline__ __device__ int
583 Ballot(int predicate, volatile int* cta_buffer)
584 {
586#if CUDA_VERSION >= 9000
588#else
589 return __ballot(predicate);
590#endif
591 }
592
593 static __forceinline__ __device__ bool
594 All(int predicate, volatile int* cta_buffer)
595 {
597#if CUDA_VERSION >= 9000
598 return __all_sync (__activemask (), predicate);
599#else
600 return __all(predicate);
601#endif
602 }
603 };
604 }
605}
606
607#endif /* PCL_GPU_KINFU_CUDA_UTILS_HPP_ */
Iterator class for point clouds with or without given indices.
__device__ __forceinline__ float3 operator+(const float3 &v1, const float3 &v2)
Definition utils.hpp:71
__device__ __forceinline__ float3 operator*(const Mat33 &m, const float3 &vec)
Definition device.hpp:74
__device__ __forceinline__ float3 operator-(const float3 &v1, const float3 &v2)
Definition utils.hpp:83
__device__ __forceinline__ float3 normalized(const float3 &v)
Definition utils.hpp:101
__device__ __forceinline__ float dot(const float3 &v1, const float3 &v2)
Definition utils.hpp:59
__device__ __forceinline__ void computeRoots3(float c0, float c1, float c2, float3 &roots)
Definition eigen.hpp:115
__device__ __host__ __forceinline__ float norm(const float3 &v1, const float3 &v2)
__device__ __forceinline__ float3 & operator+=(float3 &vec, const float &v)
Definition utils.hpp:65
__device__ __forceinline__ void computeRoots2(const float &b, const float &c, float3 &roots)
Definition eigen.hpp:102
__device__ __forceinline__ float3 & operator*=(float3 &vec, const float &v)
Definition utils.hpp:77
__device__ __host__ __forceinline__ void swap(T &a, T &b)
Definition utils.hpp:53
__device__ __host__ __forceinline__ float3 cross(const float3 &v1, const float3 &v2)
Definition utils.hpp:107
void ignore(const T &...)
Utility function to eliminate unused variable warnings.
Definition utils.h:62
static __device__ __forceinline__ void reduce(volatile T *buffer, BinOp op)
Definition utils.hpp:476
static __device__ __forceinline__ T reduce(volatile T *buffer, T init, BinOp op)
Definition utils.hpp:498
static __device__ __forceinline__ unsigned int stride()
Definition utils.hpp:464
static __device__ __forceinline__ int flattenedThreadId()
Definition utils.hpp:470
__device__ __host__ __forceinline__ float3 & operator[](int i)
Definition utils.hpp:181
__device__ __host__ __forceinline__ const float3 & operator[](int i) const
Definition utils.hpp:182
__device__ __forceinline__ void compute(Mat33 &tmp, Mat33 &vec_tmp, Mat33 &evecs, float3 &evals)
Definition utils.hpp:224
static __forceinline__ __device__ float3 unitOrthogonal(const float3 &src)
Definition utils.hpp:189
__device__ __forceinline__ Eigen33(volatile float *mat_pkg_arg)
Definition utils.hpp:222
static __forceinline__ __device__ int Ballot(int predicate, volatile int *cta_buffer)
Definition utils.hpp:583
static __forceinline__ __device__ bool All(int predicate, volatile int *cta_buffer)
Definition utils.hpp:594
static __device__ __forceinline__ int warp_reduce(volatile int *ptr, const unsigned int tid)
Definition utils.hpp:565
static __device__ __forceinline__ unsigned int laneId()
Returns the warp lane ID of the calling thread.
Definition utils.hpp:534
static __device__ __forceinline__ int binaryExclScan(int ballot_mask)
Definition utils.hpp:555
static __device__ __forceinline__ int laneMaskLt()
Definition utils.hpp:548
static __device__ __forceinline__ unsigned int id()
Definition utils.hpp:541