#ifndef PCL_GPU_KINFU_CUDA_UTILS_HPP_
#define PCL_GPU_KINFU_CUDA_UTILS_HPP_

#include <pcl/common/utils.h>

#include <limits>
// ...

__device__ __forceinline__ void
computeRoots2 (const float& b, const float& c, float3& roots)
{
  // ...
  // Discriminant of the quadratic x^2 - b*x + c = 0.
  float d = b * b - 4.f * c;
  // ...
}
__device__ __forceinline__ void
computeRoots3 (float c0, float c1, float c2, float3& roots)
{
  if (std::abs (c0) < std::numeric_limits<float>::epsilon ())
  {
    // One root is 0: the cubic degenerates to a quadratic equation.
    computeRoots2 (c2, c1, roots);
  }
  else
  {
    const float s_inv3 = 1.f / 3.f;
    const float s_sqrt3 = sqrtf (3.f);

    // Construct the parameters used in classifying the roots of the
    // equation and in solving it in closed form.
    float c2_over_3 = c2 * s_inv3;
    float a_over_3 = (c1 - c2 * c2_over_3) * s_inv3;
    if (a_over_3 > 0.f)  // clamp so sqrtf (-a_over_3) below stays real
      a_over_3 = 0.f;

    float half_b = 0.5f * (c0 + c2_over_3 * (2.f * c2_over_3 * c2_over_3 - c1));

    float q = half_b * half_b + a_over_3 * a_over_3 * a_over_3;
    if (q > 0.f)         // same clamp for sqrtf (-q)
      q = 0.f;

    // Compute the eigenvalues by solving for the roots of the polynomial.
    float rho = sqrtf (-a_over_3);
    float theta = std::atan2 (sqrtf (-q), half_b) * s_inv3;
    float cos_theta = __cosf (theta);
    float sin_theta = __sinf (theta);
    roots.x = c2_over_3 + 2.f * rho * cos_theta;
    roots.y = c2_over_3 - rho * (cos_theta + s_sqrt3 * sin_theta);
    roots.z = c2_over_3 - rho * (cos_theta - s_sqrt3 * sin_theta);

    // Sort the roots into increasing order.
    if (roots.x >= roots.y)
      swap (roots.x, roots.y);

    if (roots.y >= roots.z)
    {
      swap (roots.y, roots.z);

      if (roots.x >= roots.y)
        swap (roots.x, roots.y);
    }
    // ...
  }
}
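// --- Example (not in the original header; the kernel name is hypothetical) --
// A minimal check of computeRoots3: the coefficients form the polynomial
// x^3 - c2*x^2 + c1*x - c0, so diag(1, 2, 3) gives (x-1)(x-2)(x-3) =
// x^3 - 6x^2 + 11x - 6, i.e. c2 = 6, c1 = 11, c0 = 6.
__global__ void
testComputeRoots3 (float3* out)
{
  float3 roots;
  computeRoots3 (6.f, 11.f, 6.f, roots);
  *out = roots; // expect (1, 2, 3): roots come back in ascending order
}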
// ...

// Inside unitOrthogonal (const float3& src): unless both x and y are close
// to zero relative to z, the normalized (-y, x, 0) is orthogonal to src.
if (!isMuchSmallerThan (src.x, src.z) || !isMuchSmallerThan (src.y, src.z))
// ...
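// --- Example (not in the original header; a reconstruction for clarity) -----
// The branch bodies are elided above; this standalone sketch shows the
// technique (after Eigen's unitOrthogonal): cross src with whichever axis it
// is far from parallel to, then normalize. The function name is hypothetical
// and the smallness test is inlined.
__device__ __forceinline__ float3
unitOrthogonalSketch (const float3& src)
{
  const float eps = std::numeric_limits<float>::epsilon ();
  bool x_small = src.x * src.x <= eps * eps * src.z * src.z;
  bool y_small = src.y * src.y <= eps * eps * src.z * src.z;

  if (!x_small || !y_small)
  {
    // src has a significant x or y component: (-y, x, 0) is orthogonal.
    float invnm = rsqrtf (src.x * src.x + src.y * src.y);
    return make_float3 (-src.y * invnm, src.x * invnm, 0.f);
  }
  // src is nearly parallel to the z-axis: cross with the x-axis instead.
  float invnm = rsqrtf (src.z * src.z + src.y * src.y);
  return make_float3 (0.f, -src.z * invnm, src.y * invnm);
}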
// ...

// Inside compute (Mat33& tmp, Mat33& vec_tmp, Mat33& evecs, float3& evals):
// scale the matrix so its entries stay within [-1, 1]; forming the cubic's
// coefficients below would otherwise overflow float for large entries.
float max01 = fmaxf (std::abs (mat_pkg[0]), std::abs (mat_pkg[1]));
float max23 = fmaxf (std::abs (mat_pkg[2]), std::abs (mat_pkg[3]));
float max45 = fmaxf (std::abs (mat_pkg[4]), std::abs (mat_pkg[5]));
float m0123 = fmaxf (max01, max23);
float scale = fmaxf (max45, m0123);

if (scale <= std::numeric_limits<float>::min ())
  scale = 1.f; // degenerate (near-null) matrix: avoid dividing by zero

// ...
// The characteristic equation is x^3 - c2*x^2 + c1*x - c0 = 0. For a
// symmetric matrix: c2 is the trace, c1 the sum of the principal 2x2
// minors, c0 the determinant; all roots are guaranteed to be real.
float c0 = m00() * m11() * m22()
    + 2.f * m01() * m02() * m12()
    - m00() * m12() * m12()
    - m11() * m02() * m02()
    - m22() * m01() * m01();
float c1 = m00() * m11() - m01() * m01()
    + m00() * m22() - m02() * m02()
    + m11() * m22() - m12() * m12();
float c2 = m00() + m11() + m22();

computeRoots3 (c0, c1, c2, evals);
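// --- Worked example (not in the original header) ----------------------------
// For the symmetric matrix below, packed as mat_pkg = { 2, 1, 0, 2, 0, 3 }:
//   | 2 1 0 |
//   | 1 2 0 |
//   | 0 0 3 |
// c2 = 2+2+3 = 7 (trace), c1 = (4-1) + (6-0) + (6-0) = 15 (sum of principal
// 2x2 minors), c0 = 9 (determinant), and indeed
// x^3 - 7x^2 + 15x - 9 = (x-1)(x-3)^2, so evals = (1, 3, 3) after sorting.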
if (evals.z - evals.x <= std::numeric_limits<float>::epsilon ())
{
  // All three eigenvalues are (numerically) equal: any orthonormal basis
  // is a valid set of eigenvectors.
  // ...
}
else if (evals.y - evals.x <= std::numeric_limits<float>::epsilon ())
{
  // First and second eigenvalues are equal: only the eigenvector of the
  // distinct eigenvalue evals.z has to be computed from (A - evals.z*I).
  tmp[0] = row0 ();  tmp[1] = row1 ();  tmp[2] = row2 ();
  // ...
}
else if (evals.z - evals.y <= std::numeric_limits<float>::epsilon ())
{
  // Second and third eigenvalues are equal: compute the eigenvector of the
  // distinct eigenvalue evals.x from (A - evals.x*I).
  tmp[0] = row0 ();  tmp[1] = row1 ();  tmp[2] = row2 ();
  // ...
}
else
{
  // All eigenvalues are distinct: the matrix is reloaded into tmp once per
  // eigenvalue so that (A - eval*I) can be formed and its null space
  // extracted from cross products of its rows.
  tmp[0] = row0 ();  tmp[1] = row1 ();  tmp[2] = row2 ();
  // ... eigenvector for evals.x ...

  tmp[0] = row0 ();  tmp[1] = row1 ();  tmp[2] = row2 ();
  // ... eigenvector for evals.y ...

  tmp[0] = row0 ();  tmp[1] = row1 ();  tmp[2] = row2 ();
  // ... eigenvector for evals.z ...
}
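// --- Example (not in the original header; the name is hypothetical) ---------
// The cross-product trick used above, as a standalone sketch using the dot,
// cross, and normalized helpers from this header: for symmetric A and
// eigenvalue lambda, (A - lambda*I) is singular, so the cross product of two
// independent rows lies in its null space, i.e. along the eigenvector.
// Picking the longest of the three candidates adds numerical stability.
__device__ __forceinline__ float3
eigenvectorSketch (const float3& r0, const float3& r1, const float3& r2, float lambda)
{
  // Rows of (A - lambda * I).
  float3 m0 = make_float3 (r0.x - lambda, r0.y, r0.z);
  float3 m1 = make_float3 (r1.x, r1.y - lambda, r1.z);
  float3 m2 = make_float3 (r2.x, r2.y, r2.z - lambda);

  float3 c01 = cross (m0, m1);
  float3 c02 = cross (m0, m2);
  float3 c12 = cross (m1, m2);
  float l01 = dot (c01, c01), l02 = dot (c02, c02), l12 = dot (c12, c12);

  if (l01 >= l02 && l01 >= l12)
    return normalized (c01);
  if (l02 >= l12)
    return normalized (c02);
  return normalized (c12);
}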
// ...

// The symmetric input matrix is stored packed, upper triangle only:
// mat_pkg = { m00, m01, m02, m11, m12, m22 }.
volatile float* mat_pkg;

__device__ __forceinline__ float m00() const { return mat_pkg[0]; }
__device__ __forceinline__ float m01() const { return mat_pkg[1]; }
__device__ __forceinline__ float m02() const { return mat_pkg[2]; }
__device__ __forceinline__ float m10() const { return mat_pkg[1]; }
__device__ __forceinline__ float m11() const { return mat_pkg[3]; }
__device__ __forceinline__ float m12() const { return mat_pkg[4]; }
__device__ __forceinline__ float m20() const { return mat_pkg[2]; }
__device__ __forceinline__ float m21() const { return mat_pkg[4]; }
__device__ __forceinline__ float m22() const { return mat_pkg[5]; }
__device__ __forceinline__ float3 row0() const { return make_float3( m00(), m01(), m02() ); }
__device__ __forceinline__ float3 row1() const { return make_float3( m10(), m11(), m12() ); }
__device__ __forceinline__ float3 row2() const { return make_float3( m20(), m21(), m22() ); }
__device__ __forceinline__ static bool
isMuchSmallerThan (float x, float y)
{
  // Relative test as in Eigen: |x| is negligible next to |y| at float
  // precision, e.g. true for (1e-8f, 1.f) but false for (1e-5f, 1.f),
  // since epsilon^2 is roughly 1.4e-14.
  const float prec_sqr = std::numeric_limits<float>::epsilon () * std::numeric_limits<float>::epsilon ();
  return x * x <= prec_sqr * y * y;
}
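// --- Example (not in the original header; kernel and inputs hypothetical) ---
// How Eigen33 is typically driven, modeled on KinFu's normal estimation
// from a 3x3 covariance matrix:
__global__ void
smallestEigenvectorSketch (float3* normal_out)
{
  // Packed symmetric covariance { cxx, cxy, cxz, cyy, cyz, czz }.
  float cov[6] = { 2.f, 1.f, 0.f, 2.f, 0.f, 3.f };

  Eigen33 eigen33 (cov);
  Eigen33::Mat33 tmp, vec_tmp, evecs;
  float3 evals;
  eigen33.compute (tmp, vec_tmp, evecs, evals);

  // Eigenvalues come back in ascending order, so evecs[0] spans the
  // direction of least variance -- the surface normal of a neighborhood.
  *normal_out = normalized (evecs[0]);
}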
template<int CTA_SIZE, typename T, class BinOp>
static __device__ __forceinline__ void
reduce (volatile T* buffer, BinOp op)
{
  // ... tree reduction over the CTA; the result is left in buffer[0] ...
}

template<int CTA_SIZE, typename T, class BinOp>
static __device__ __forceinline__ T
reduce (volatile T* buffer, T init, BinOp op)
{
  // ... same reduction seeded with init; returns the reduced value ...
}
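// --- Example (not in the original header) -----------------------------------
// A typical call site for the CTA-wide reduction, assuming the overloads
// above live in a Block struct as in the KinFu device code; the kernel and
// the plus functor are local to this sketch.
struct plus
{
  __device__ __forceinline__ float
  operator() (const float& lhs, const volatile float& rhs) const
  { return lhs + rhs; }
};

template<int CTA_SIZE> __global__ void
blockSumSketch (const float* in, float* out)
{
  __shared__ float smem[CTA_SIZE];
  int tid = threadIdx.x;
  smem[tid] = in[blockIdx.x * CTA_SIZE + tid]; // one partial per thread
  __syncthreads ();

  Block::reduce<CTA_SIZE> (smem, plus ());     // result lands in smem[0]
  if (tid == 0)
    out[blockIdx.x] = smem[0];
}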
/** \brief Returns the warp lane ID of the calling thread. */
static __device__ __forceinline__ unsigned int
laneId ()
{
  unsigned int ret;
  asm ("mov.u32 %0, %laneid;" : "=r"(ret));
  return ret;
}

// ...

/** \brief Returns a bitmask of the lanes whose ID is lower than the caller's. */
static __device__ __forceinline__ int
laneMaskLt ()
{
  unsigned int ret;
  asm ("mov.u32 %0, %lanemask_lt;" : "=r"(ret));
  return ret;
}
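// --- Example (not in the original header; the name is hypothetical) ---------
// What %lanemask_lt is for: AND-ed with a warp ballot and popcounted, it
// yields an exclusive scan of a per-lane predicate, which is essentially how
// binaryExclScan in this header computes compaction offsets. CUDA 9+
// intrinsics assumed.
__device__ __forceinline__ int
warpCompactOffsetSketch (bool keep)
{
  unsigned int vote = __ballot_sync (__activemask (), keep); // one bit per lane
  // Set bits from lanes below this one = number of earlier "keep" lanes,
  // i.e. this lane's write offset.
  return __popc (laneMaskLt () & vote);
}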
// ...

// Inside warp_reduce (volatile int* ptr, const unsigned int tid):
const unsigned int lane = tid & 31; // index of the thread within its warp (0..31)
// ...
// Ballot (int predicate, volatile int* cta_buffer) and
// All (int predicate, volatile int* cta_buffer) branch on the toolkit:
// CUDA 9 made the warp-synchronous *_sync vote intrinsics mandatory.
#if CUDA_VERSION >= 9000 // inside Ballot ()
// ...
#endif

#if CUDA_VERSION >= 9000 // inside All ()
// ...
#endif
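// --- Example (not in the original header; names are hypothetical) -----------
// The shape of the guarded code: on CUDA 9+ the *_sync intrinsics need an
// explicit mask of participating lanes, which __activemask() supplies;
// pre-9 toolkits used the legacy vote intrinsics (a shared-memory emulation
// for very old hardware is elided here).
__device__ __forceinline__ int
ballotSketch (int predicate)
{
#if CUDA_VERSION >= 9000
  return __ballot_sync (__activemask (), predicate); // one vote bit per lane
#else
  return __ballot (predicate);
#endif
}

__device__ __forceinline__ bool
allSketch (int predicate)
{
#if CUDA_VERSION >= 9000
  return __all_sync (__activemask (), predicate);    // true iff all lanes vote true
#else
  return __all (predicate);
#endif
}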
// ...

#endif /* PCL_GPU_KINFU_CUDA_UTILS_HPP_ */