Vector Optimized Library of Kernels  3.2.0
Architecture-tuned implementations of math kernels
volk_sse_intrinsics.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2015 Free Software Foundation, Inc.
4  * Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com>
5  *
6  * This file is part of VOLK
7  *
8  * SPDX-License-Identifier: LGPL-3.0-or-later
9  */
10 
11 /*
12  * This file is intended to hold helper routines composed of SSE intrinsics.
13  * They should be used in VOLK kernels to avoid copy-pasta.
14  */
15 
16 #ifndef INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_
17 #define INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_
18 #include <xmmintrin.h>
19 
/*
 * Approximate arctan(x) via polynomial expansion
 * on the interval [-1, 1].
 *
 * Maximum relative error ~6.5e-7
 *
 * The approximation is the odd polynomial
 *   x * (a1 + a3*x^2 + a5*x^4 + a7*x^6 + a9*x^8 + a11*x^10 + a13*x^12)
 * whose inner part is evaluated in x^2 with Horner's method; the final
 * multiplication by x restores the odd powers.
 *
 * x:       four packed single-precision inputs
 * returns: four packed results, one approximation per lane
 */
static inline __m128 _mm_arctan_poly_sse(const __m128 x)
{
    /* Coefficients are written as hexadecimal float constants so that each
     * one denotes an exact single-precision value. */
    const __m128 a1 = _mm_set1_ps(+0x1.ffffeap-1f);
    const __m128 a3 = _mm_set1_ps(-0x1.55437p-2f);
    const __m128 a5 = _mm_set1_ps(+0x1.972be6p-3f);
    const __m128 a7 = _mm_set1_ps(-0x1.1436ap-3f);
    const __m128 a9 = _mm_set1_ps(+0x1.5785aap-4f);
    const __m128 a11 = _mm_set1_ps(-0x1.2f3004p-5f);
    const __m128 a13 = _mm_set1_ps(+0x1.01a37cp-7f);

    const __m128 x_times_x = _mm_mul_ps(x, x);

    /* Horner steps, highest-order coefficient first:
     * poly = poly * x^2 + a_k for k = 11, 9, ..., 1. */
    __m128 arctan = a13;
    arctan = _mm_add_ps(_mm_mul_ps(x_times_x, arctan), a11);
    arctan = _mm_add_ps(_mm_mul_ps(x_times_x, arctan), a9);
    arctan = _mm_add_ps(_mm_mul_ps(x_times_x, arctan), a7);
    arctan = _mm_add_ps(_mm_mul_ps(x_times_x, arctan), a5);
    arctan = _mm_add_ps(_mm_mul_ps(x_times_x, arctan), a3);
    arctan = _mm_add_ps(_mm_mul_ps(x_times_x, arctan), a1);

    /* Multiply by x to turn the even polynomial in x^2 into the odd one. */
    return _mm_mul_ps(x, arctan);
}
56 
/*
 * Compute the squared magnitude I^2 + Q^2 of four complex samples.
 *
 * Each input vector is read as two interleaved complex values:
 *   cplxValue1 = { i1, q1, i2, q2 },  cplxValue2 = { i3, q3, i4, q4 }
 *
 * returns: { i1^2 + q1^2, i2^2 + q2^2, i3^2 + q3^2, i4^2 + q4^2 }
 */
static inline __m128 _mm_magnitudesquared_ps(__m128 cplxValue1, __m128 cplxValue2)
{
    // Even lanes of both inputs: i1 i2 i3 i4
    const __m128 iValue =
        _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
    // Odd lanes of both inputs: q1 q2 q3 q4
    const __m128 qValue =
        _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));

    const __m128 iSquared = _mm_mul_ps(iValue, iValue);
    const __m128 qSquared = _mm_mul_ps(qValue, qValue);

    return _mm_add_ps(iSquared, qSquared);
}
68 
69 static inline __m128 _mm_magnitude_ps(__m128 cplxValue1, __m128 cplxValue2)
70 {
71  return _mm_sqrt_ps(_mm_magnitudesquared_ps(cplxValue1, cplxValue2));
72 }
73 
74 static inline __m128 _mm_scaled_norm_dist_ps_sse(const __m128 symbols0,
75  const __m128 symbols1,
76  const __m128 points0,
77  const __m128 points1,
78  const __m128 scalar)
79 {
80  // calculate scalar * |x - y|^2
81  const __m128 diff0 = _mm_sub_ps(symbols0, points0);
82  const __m128 diff1 = _mm_sub_ps(symbols1, points1);
83  const __m128 norms = _mm_magnitudesquared_ps(diff0, diff1);
84  return _mm_mul_ps(norms, scalar);
85 }
86 
/*
 * Add a scaled squared deviation to an accumulator, lane by lane:
 *
 *   result = sq_acc + ((aux * val - acc)^2) * rec
 *
 * sq_acc: running sum that the new term is added to
 * acc:    value subtracted from aux * val before squaring
 * val:    value multiplied by aux
 * rec:    factor applied to the squared difference
 * aux:    factor applied to val
 *
 * NOTE(review): the parameter names suggest a running sum-of-squares update
 * (rec being a reciprocal weight); confirm against the calling kernels.
 */
static inline __m128 _mm_accumulate_square_sum_ps(
    __m128 sq_acc, __m128 acc, __m128 val, __m128 rec, __m128 aux)
{
    const __m128 scaled_val = _mm_mul_ps(aux, val);
    const __m128 deviation = _mm_sub_ps(scaled_val, acc);
    const __m128 deviation_sq = _mm_mul_ps(deviation, deviation);
    const __m128 weighted = _mm_mul_ps(deviation_sq, rec);
    return _mm_add_ps(sq_acc, weighted);
}
96 
97 #endif /* INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_ */
val
Definition: volk_arch_defs.py:57
static __m128 _mm_magnitudesquared_ps(__m128 cplxValue1, __m128 cplxValue2)
Definition: volk_sse_intrinsics.h:57
static __m128 _mm_arctan_poly_sse(const __m128 x)
Definition: volk_sse_intrinsics.h:27
static __m128 _mm_accumulate_square_sum_ps(__m128 sq_acc, __m128 acc, __m128 val, __m128 rec, __m128 aux)
Definition: volk_sse_intrinsics.h:87
static __m128 _mm_scaled_norm_dist_ps_sse(const __m128 symbols0, const __m128 symbols1, const __m128 points0, const __m128 points1, const __m128 scalar)
Definition: volk_sse_intrinsics.h:74
static __m128 _mm_magnitude_ps(__m128 cplxValue1, __m128 cplxValue2)
Definition: volk_sse_intrinsics.h:69