Vector Optimized Library of Kernels  3.2.0
Architecture-tuned implementations of math kernels
volk_32fc_s32f_power_spectrum_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
40 #ifndef INCLUDED_volk_32fc_s32f_power_spectrum_32f_a_H
41 #define INCLUDED_volk_32fc_s32f_power_spectrum_32f_a_H
42 
43 #include <inttypes.h>
44 #include <math.h>
45 #include <stdio.h>
46 
47 #ifdef LV_HAVE_GENERIC
48 
49 static inline void
51  const lv_32fc_t* complexFFTInput,
52  const float normalizationFactor,
53  unsigned int num_points)
54 {
55  // Calculate the Power of the complex point
56  const float normFactSq = 1.0 / (normalizationFactor * normalizationFactor);
57 
58  // Calculate dBm
59  // 50 ohm load assumption
60  // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10)
61  // 75 ohm load assumption
62  // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15)
63 
64  /*
65  * For generic reference, the code below is a volk-optimized
66  * approach that also leverages a faster log2 calculation
67  * to calculate the log10:
68  * n*log10(x) = n*log2(x)/log2(10) = (n/log2(10)) * log2(x)
69  *
70  * Generic code:
71  *
72  * const float real = *inputPtr++ * iNormalizationFactor;
73  * const float imag = *inputPtr++ * iNormalizationFactor;
74  * realFFTDataPointsPtr = 10.0*log10f(((real * real) + (imag * imag)) + 1e-20);
75  * realFFTDataPointsPtr++;
76  *
77  */
78 
79  // Calc mag^2
80  volk_32fc_magnitude_squared_32f(logPowerOutput, complexFFTInput, num_points);
81 
82  // Finish ((real * real) + (imag * imag)) calculation:
83  volk_32f_s32f_multiply_32f(logPowerOutput, logPowerOutput, normFactSq, num_points);
84 
85  // The following calculates 10*log10(x) = 10*log2(x)/log2(10) = (10/log2(10))
86  // * log2(x)
87  volk_32f_log2_32f(logPowerOutput, logPowerOutput, num_points);
88  volk_32f_s32f_multiply_32f(
89  logPowerOutput, logPowerOutput, volk_log2to10factor, num_points);
90 }
91 #endif /* LV_HAVE_GENERIC */
92 
93 #ifdef LV_HAVE_NEON
94 #include <arm_neon.h>
96 
97 static inline void
99  const lv_32fc_t* complexFFTInput,
100  const float normalizationFactor,
101  unsigned int num_points)
102 {
103  float* logPowerOutputPtr = logPowerOutput;
104  const lv_32fc_t* complexFFTInputPtr = complexFFTInput;
105  const float iNormalizationFactor = 1.0 / normalizationFactor;
106  unsigned int number;
107  unsigned int quarter_points = num_points / 4;
108  float32x4x2_t fft_vec;
109  float32x4_t log_pwr_vec;
110  float32x4_t mag_squared_vec;
111 
112  const float inv_ln10_10 = 4.34294481903f; // 10.0/ln(10.)
113 
114  for (number = 0; number < quarter_points; number++) {
115  // Load
116  fft_vec = vld2q_f32((float*)complexFFTInputPtr);
117  // Prefetch next 4
118  __VOLK_PREFETCH(complexFFTInputPtr + 4);
119  // Normalize
120  fft_vec.val[0] = vmulq_n_f32(fft_vec.val[0], iNormalizationFactor);
121  fft_vec.val[1] = vmulq_n_f32(fft_vec.val[1], iNormalizationFactor);
122  mag_squared_vec = _vmagnitudesquaredq_f32(fft_vec);
123  log_pwr_vec = vmulq_n_f32(_vlogq_f32(mag_squared_vec), inv_ln10_10);
124  // Store
125  vst1q_f32(logPowerOutputPtr, log_pwr_vec);
126  // Move pointers ahead
127  complexFFTInputPtr += 4;
128  logPowerOutputPtr += 4;
129  }
130 
131  // deal with the rest
132  for (number = quarter_points * 4; number < num_points; number++) {
133  const float real = lv_creal(*complexFFTInputPtr) * iNormalizationFactor;
134  const float imag = lv_cimag(*complexFFTInputPtr) * iNormalizationFactor;
135 
136  *logPowerOutputPtr =
137  volk_log2to10factor * log2f_non_ieee(((real * real) + (imag * imag)));
138  complexFFTInputPtr++;
139  logPowerOutputPtr++;
140  }
141 }
142 
143 #endif /* LV_HAVE_NEON */
144 
145 
146 #ifdef LV_HAVE_RVV
147 #include <riscv_vector.h>
148 
149 static inline void volk_32fc_s32f_power_spectrum_32f_rvv(float* logPowerOutput,
150  const lv_32fc_t* complexFFTInput,
151  const float normalizationFactor,
152  unsigned int num_points)
153 {
154  size_t vlmax = __riscv_vsetvlmax_e32m2();
155 
156 #if LOG_POLY_DEGREE == 6
157  const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(3.1157899f, vlmax);
158  const vfloat32m2_t c4 = __riscv_vfmv_v_f_f32m2(-3.3241990f, vlmax);
159  const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(2.5988452f, vlmax);
160  const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(-1.2315303f, vlmax);
161  const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(3.1821337e-1f, vlmax);
162  const vfloat32m2_t c0 = __riscv_vfmv_v_f_f32m2(-3.4436006e-2f, vlmax);
163 #elif LOG_POLY_DEGREE == 5
164  const vfloat32m2_t c4 = __riscv_vfmv_v_f_f32m2(2.8882704548164776201f, vlmax);
165  const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(-2.52074962577807006663f, vlmax);
166  const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(1.48116647521213171641f, vlmax);
167  const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(-0.465725644288844778798f, vlmax);
168  const vfloat32m2_t c0 = __riscv_vfmv_v_f_f32m2(0.0596515482674574969533f, vlmax);
169 #elif LOG_POLY_DEGREE == 4
170  const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(2.61761038894603480148f, vlmax);
171  const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(-1.75647175389045657003f, vlmax);
172  const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(0.688243882994381274313f, vlmax);
173  const vfloat32m2_t c0 = __riscv_vfmv_v_f_f32m2(-0.107254423828329604454f, vlmax);
174 #elif LOG_POLY_DEGREE == 3
175  const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(2.28330284476918490682f, vlmax);
176  const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(-1.04913055217340124191f, vlmax);
177  const vfloat32m2_t c0 = __riscv_vfmv_v_f_f32m2(0.204446009836232697516f, vlmax);
178 #else
179 #error
180 #endif
181 
182  const vfloat32m2_t cf1 = __riscv_vfmv_v_f_f32m2(1.0f, vlmax);
183  const vint32m2_t m1 = __riscv_vreinterpret_i32m2(cf1);
184  const vint32m2_t m2 = __riscv_vmv_v_x_i32m2(0x7FFFFF, vlmax);
185  const vint32m2_t c127 = __riscv_vmv_v_x_i32m2(127, vlmax);
186 
187  const float normFactSq = 1.0 / (normalizationFactor * normalizationFactor);
188 
189  size_t n = num_points;
190  for (size_t vl; n > 0; n -= vl, complexFFTInput += vl, logPowerOutput += vl) {
191  vl = __riscv_vsetvl_e32m2(n);
192  vuint64m4_t vc = __riscv_vle64_v_u64m4((const uint64_t*)complexFFTInput, vl);
193  vfloat32m2_t vr = __riscv_vreinterpret_f32m2(__riscv_vnsrl(vc, 0, vl));
194  vfloat32m2_t vi = __riscv_vreinterpret_f32m2(__riscv_vnsrl(vc, 32, vl));
195  vfloat32m2_t v = __riscv_vfmacc(__riscv_vfmul(vi, vi, vl), vr, vr, vl);
196  v = __riscv_vfmul(v, normFactSq, vl);
197 
198  vfloat32m2_t a = __riscv_vfabs(v, vl);
199  vfloat32m2_t exp = __riscv_vfcvt_f(
200  __riscv_vsub(__riscv_vsra(__riscv_vreinterpret_i32m2(a), 23, vl), c127, vl),
201  vl);
202  vfloat32m2_t frac = __riscv_vreinterpret_f32m2(
203  __riscv_vor(__riscv_vand(__riscv_vreinterpret_i32m2(v), m2, vl), m1, vl));
204 
205  vfloat32m2_t mant = c0;
206  mant = __riscv_vfmadd(mant, frac, c1, vl);
207  mant = __riscv_vfmadd(mant, frac, c2, vl);
208 #if LOG_POLY_DEGREE >= 4
209  mant = __riscv_vfmadd(mant, frac, c3, vl);
210 #if LOG_POLY_DEGREE >= 5
211  mant = __riscv_vfmadd(mant, frac, c4, vl);
212 #if LOG_POLY_DEGREE >= 6
213  mant = __riscv_vfmadd(mant, frac, c5, vl);
214 #endif
215 #endif
216 #endif
217  v = __riscv_vfmacc(exp, mant, __riscv_vfsub(frac, cf1, vl), vl);
218  v = __riscv_vfmul(v, volk_log2to10factor, vl);
219 
220  __riscv_vse32(logPowerOutput, v, vl);
221  }
222 }
223 #endif /*LV_HAVE_RVV*/
224 
225 
226 #ifdef LV_HAVE_RVVSEG
227 #include <riscv_vector.h>
228 
229 static inline void
230 volk_32fc_s32f_power_spectrum_32f_rvvseg(float* logPowerOutput,
231  const lv_32fc_t* complexFFTInput,
232  const float normalizationFactor,
233  unsigned int num_points)
234 {
235  size_t vlmax = __riscv_vsetvlmax_e32m2();
236 
237 #if LOG_POLY_DEGREE == 6
238  const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(3.1157899f, vlmax);
239  const vfloat32m2_t c4 = __riscv_vfmv_v_f_f32m2(-3.3241990f, vlmax);
240  const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(2.5988452f, vlmax);
241  const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(-1.2315303f, vlmax);
242  const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(3.1821337e-1f, vlmax);
243  const vfloat32m2_t c0 = __riscv_vfmv_v_f_f32m2(-3.4436006e-2f, vlmax);
244 #elif LOG_POLY_DEGREE == 5
245  const vfloat32m2_t c4 = __riscv_vfmv_v_f_f32m2(2.8882704548164776201f, vlmax);
246  const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(-2.52074962577807006663f, vlmax);
247  const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(1.48116647521213171641f, vlmax);
248  const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(-0.465725644288844778798f, vlmax);
249  const vfloat32m2_t c0 = __riscv_vfmv_v_f_f32m2(0.0596515482674574969533f, vlmax);
250 #elif LOG_POLY_DEGREE == 4
251  const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(2.61761038894603480148f, vlmax);
252  const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(-1.75647175389045657003f, vlmax);
253  const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(0.688243882994381274313f, vlmax);
254  const vfloat32m2_t c0 = __riscv_vfmv_v_f_f32m2(-0.107254423828329604454f, vlmax);
255 #elif LOG_POLY_DEGREE == 3
256  const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(2.28330284476918490682f, vlmax);
257  const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(-1.04913055217340124191f, vlmax);
258  const vfloat32m2_t c0 = __riscv_vfmv_v_f_f32m2(0.204446009836232697516f, vlmax);
259 #else
260 #error
261 #endif
262 
263  const vfloat32m2_t cf1 = __riscv_vfmv_v_f_f32m2(1.0f, vlmax);
264  const vint32m2_t m1 = __riscv_vreinterpret_i32m2(cf1);
265  const vint32m2_t m2 = __riscv_vmv_v_x_i32m2(0x7FFFFF, vlmax);
266  const vint32m2_t c127 = __riscv_vmv_v_x_i32m2(127, vlmax);
267 
268  const float normFactSq = 1.0 / (normalizationFactor * normalizationFactor);
269 
270  size_t n = num_points;
271  for (size_t vl; n > 0; n -= vl, complexFFTInput += vl, logPowerOutput += vl) {
272  vl = __riscv_vsetvl_e32m2(n);
273  vfloat32m2x2_t vc =
274  __riscv_vlseg2e32_v_f32m2x2((const float*)complexFFTInput, vl);
275  vfloat32m2_t vr = __riscv_vget_f32m2(vc, 0);
276  vfloat32m2_t vi = __riscv_vget_f32m2(vc, 1);
277  vfloat32m2_t v = __riscv_vfmacc(__riscv_vfmul(vi, vi, vl), vr, vr, vl);
278  v = __riscv_vfmul(v, normFactSq, vl);
279 
280  vfloat32m2_t a = __riscv_vfabs(v, vl);
281  vfloat32m2_t exp = __riscv_vfcvt_f(
282  __riscv_vsub(__riscv_vsra(__riscv_vreinterpret_i32m2(a), 23, vl), c127, vl),
283  vl);
284  vfloat32m2_t frac = __riscv_vreinterpret_f32m2(
285  __riscv_vor(__riscv_vand(__riscv_vreinterpret_i32m2(v), m2, vl), m1, vl));
286 
287  vfloat32m2_t mant = c0;
288  mant = __riscv_vfmadd(mant, frac, c1, vl);
289  mant = __riscv_vfmadd(mant, frac, c2, vl);
290 #if LOG_POLY_DEGREE >= 4
291  mant = __riscv_vfmadd(mant, frac, c3, vl);
292 #if LOG_POLY_DEGREE >= 5
293  mant = __riscv_vfmadd(mant, frac, c4, vl);
294 #if LOG_POLY_DEGREE >= 6
295  mant = __riscv_vfmadd(mant, frac, c5, vl);
296 #endif
297 #endif
298 #endif
299  v = __riscv_vfmacc(exp, mant, __riscv_vfsub(frac, cf1, vl), vl);
300  v = __riscv_vfmul(v, volk_log2to10factor, vl);
301 
302  __riscv_vse32(logPowerOutput, v, vl);
303  }
304 }
305 
306 #endif /*LV_HAVE_RVVSEG*/
307 
308 #endif /* INCLUDED_volk_32fc_s32f_power_spectrum_32f_a_H */
static void volk_32fc_s32f_power_spectrum_32f_generic(float *logPowerOutput, const lv_32fc_t *complexFFTInput, const float normalizationFactor, unsigned int num_points)
Definition: volk_32fc_s32f_power_spectrum_32f.h:50
static void volk_32fc_s32f_power_spectrum_32f_neon(float *logPowerOutput, const lv_32fc_t *complexFFTInput, const float normalizationFactor, unsigned int num_points)
Definition: volk_32fc_s32f_power_spectrum_32f.h:98
#define volk_log2to10factor
Definition: volk_common.h:165
static float log2f_non_ieee(float f)
Definition: volk_common.h:155
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:68
#define lv_cimag(x)
Definition: volk_complex.h:98
#define lv_creal(x)
Definition: volk_complex.h:96
float complex lv_32fc_t
Definition: volk_complex.h:74
static float32x4_t _vlogq_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:143
static float32x4_t _vmagnitudesquaredq_f32(float32x4x2_t cmplxValue)
Definition: volk_neon_intrinsics.h:73