Vector Optimized Library of Kernels  3.1.2
Architecture-tuned implementations of math kernels
volk_32fc_magnitude_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
58 #ifndef INCLUDED_volk_32fc_magnitude_32f_u_H
59 #define INCLUDED_volk_32fc_magnitude_32f_u_H
60 
61 #include <inttypes.h>
62 #include <math.h>
63 #include <stdio.h>
64 
65 #ifdef LV_HAVE_AVX
66 #include <immintrin.h>
68 
69 static inline void volk_32fc_magnitude_32f_u_avx(float* magnitudeVector,
70  const lv_32fc_t* complexVector,
71  unsigned int num_points)
72 {
73  unsigned int number = 0;
74  const unsigned int eighthPoints = num_points / 8;
75 
76  const float* complexVectorPtr = (float*)complexVector;
77  float* magnitudeVectorPtr = magnitudeVector;
78 
79  __m256 cplxValue1, cplxValue2, result;
80 
81  for (; number < eighthPoints; number++) {
82  cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
83  cplxValue2 = _mm256_loadu_ps(complexVectorPtr + 8);
84  result = _mm256_magnitude_ps(cplxValue1, cplxValue2);
85  _mm256_storeu_ps(magnitudeVectorPtr, result);
86 
87  complexVectorPtr += 16;
88  magnitudeVectorPtr += 8;
89  }
90 
91  number = eighthPoints * 8;
92  for (; number < num_points; number++) {
93  float val1Real = *complexVectorPtr++;
94  float val1Imag = *complexVectorPtr++;
95  *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
96  }
97 }
98 #endif /* LV_HAVE_AVX */
99 
100 #ifdef LV_HAVE_SSE3
101 #include <pmmintrin.h>
103 
104 static inline void volk_32fc_magnitude_32f_u_sse3(float* magnitudeVector,
105  const lv_32fc_t* complexVector,
106  unsigned int num_points)
107 {
108  unsigned int number = 0;
109  const unsigned int quarterPoints = num_points / 4;
110 
111  const float* complexVectorPtr = (float*)complexVector;
112  float* magnitudeVectorPtr = magnitudeVector;
113 
114  __m128 cplxValue1, cplxValue2, result;
115  for (; number < quarterPoints; number++) {
116  cplxValue1 = _mm_loadu_ps(complexVectorPtr);
117  complexVectorPtr += 4;
118 
119  cplxValue2 = _mm_loadu_ps(complexVectorPtr);
120  complexVectorPtr += 4;
121 
122  result = _mm_magnitude_ps_sse3(cplxValue1, cplxValue2);
123 
124  _mm_storeu_ps(magnitudeVectorPtr, result);
125  magnitudeVectorPtr += 4;
126  }
127 
128  number = quarterPoints * 4;
129  for (; number < num_points; number++) {
130  float val1Real = *complexVectorPtr++;
131  float val1Imag = *complexVectorPtr++;
132  *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
133  }
134 }
135 #endif /* LV_HAVE_SSE3 */
136 
137 
138 #ifdef LV_HAVE_SSE
140 #include <xmmintrin.h>
141 
142 static inline void volk_32fc_magnitude_32f_u_sse(float* magnitudeVector,
143  const lv_32fc_t* complexVector,
144  unsigned int num_points)
145 {
146  unsigned int number = 0;
147  const unsigned int quarterPoints = num_points / 4;
148 
149  const float* complexVectorPtr = (float*)complexVector;
150  float* magnitudeVectorPtr = magnitudeVector;
151 
152  __m128 cplxValue1, cplxValue2, result;
153 
154  for (; number < quarterPoints; number++) {
155  cplxValue1 = _mm_loadu_ps(complexVectorPtr);
156  complexVectorPtr += 4;
157 
158  cplxValue2 = _mm_loadu_ps(complexVectorPtr);
159  complexVectorPtr += 4;
160 
161  result = _mm_magnitude_ps(cplxValue1, cplxValue2);
162  _mm_storeu_ps(magnitudeVectorPtr, result);
163  magnitudeVectorPtr += 4;
164  }
165 
166  number = quarterPoints * 4;
167  for (; number < num_points; number++) {
168  float val1Real = *complexVectorPtr++;
169  float val1Imag = *complexVectorPtr++;
170  *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
171  }
172 }
173 #endif /* LV_HAVE_SSE */
174 
175 
176 #ifdef LV_HAVE_GENERIC
177 
178 static inline void volk_32fc_magnitude_32f_generic(float* magnitudeVector,
179  const lv_32fc_t* complexVector,
180  unsigned int num_points)
181 {
182  const float* complexVectorPtr = (float*)complexVector;
183  float* magnitudeVectorPtr = magnitudeVector;
184  unsigned int number = 0;
185  for (number = 0; number < num_points; number++) {
186  const float real = *complexVectorPtr++;
187  const float imag = *complexVectorPtr++;
188  *magnitudeVectorPtr++ = sqrtf((real * real) + (imag * imag));
189  }
190 }
191 #endif /* LV_HAVE_GENERIC */
192 
193 
194 #endif /* INCLUDED_volk_32fc_magnitude_32f_u_H */
195 #ifndef INCLUDED_volk_32fc_magnitude_32f_a_H
196 #define INCLUDED_volk_32fc_magnitude_32f_a_H
197 
198 #include <inttypes.h>
199 #include <math.h>
200 #include <stdio.h>
201 
202 #ifdef LV_HAVE_AVX
203 #include <immintrin.h>
205 
206 static inline void volk_32fc_magnitude_32f_a_avx(float* magnitudeVector,
207  const lv_32fc_t* complexVector,
208  unsigned int num_points)
209 {
210  unsigned int number = 0;
211  const unsigned int eighthPoints = num_points / 8;
212 
213  const float* complexVectorPtr = (float*)complexVector;
214  float* magnitudeVectorPtr = magnitudeVector;
215 
216  __m256 cplxValue1, cplxValue2, result;
217  for (; number < eighthPoints; number++) {
218  cplxValue1 = _mm256_load_ps(complexVectorPtr);
219  complexVectorPtr += 8;
220 
221  cplxValue2 = _mm256_load_ps(complexVectorPtr);
222  complexVectorPtr += 8;
223 
224  result = _mm256_magnitude_ps(cplxValue1, cplxValue2);
225  _mm256_store_ps(magnitudeVectorPtr, result);
226  magnitudeVectorPtr += 8;
227  }
228 
229  number = eighthPoints * 8;
230  for (; number < num_points; number++) {
231  float val1Real = *complexVectorPtr++;
232  float val1Imag = *complexVectorPtr++;
233  *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
234  }
235 }
236 #endif /* LV_HAVE_AVX */
237 
238 #ifdef LV_HAVE_SSE3
239 #include <pmmintrin.h>
241 
242 static inline void volk_32fc_magnitude_32f_a_sse3(float* magnitudeVector,
243  const lv_32fc_t* complexVector,
244  unsigned int num_points)
245 {
246  unsigned int number = 0;
247  const unsigned int quarterPoints = num_points / 4;
248 
249  const float* complexVectorPtr = (float*)complexVector;
250  float* magnitudeVectorPtr = magnitudeVector;
251 
252  __m128 cplxValue1, cplxValue2, result;
253  for (; number < quarterPoints; number++) {
254  cplxValue1 = _mm_load_ps(complexVectorPtr);
255  complexVectorPtr += 4;
256 
257  cplxValue2 = _mm_load_ps(complexVectorPtr);
258  complexVectorPtr += 4;
259 
260  result = _mm_magnitude_ps_sse3(cplxValue1, cplxValue2);
261  _mm_store_ps(magnitudeVectorPtr, result);
262  magnitudeVectorPtr += 4;
263  }
264 
265  number = quarterPoints * 4;
266  for (; number < num_points; number++) {
267  float val1Real = *complexVectorPtr++;
268  float val1Imag = *complexVectorPtr++;
269  *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
270  }
271 }
272 #endif /* LV_HAVE_SSE3 */
273 
274 #ifdef LV_HAVE_SSE
276 #include <xmmintrin.h>
277 
278 static inline void volk_32fc_magnitude_32f_a_sse(float* magnitudeVector,
279  const lv_32fc_t* complexVector,
280  unsigned int num_points)
281 {
282  unsigned int number = 0;
283  const unsigned int quarterPoints = num_points / 4;
284 
285  const float* complexVectorPtr = (float*)complexVector;
286  float* magnitudeVectorPtr = magnitudeVector;
287 
288  __m128 cplxValue1, cplxValue2, result;
289  for (; number < quarterPoints; number++) {
290  cplxValue1 = _mm_load_ps(complexVectorPtr);
291  complexVectorPtr += 4;
292 
293  cplxValue2 = _mm_load_ps(complexVectorPtr);
294  complexVectorPtr += 4;
295 
296  result = _mm_magnitude_ps(cplxValue1, cplxValue2);
297  _mm_store_ps(magnitudeVectorPtr, result);
298  magnitudeVectorPtr += 4;
299  }
300 
301  number = quarterPoints * 4;
302  for (; number < num_points; number++) {
303  float val1Real = *complexVectorPtr++;
304  float val1Imag = *complexVectorPtr++;
305  *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
306  }
307 }
308 #endif /* LV_HAVE_SSE */
309 
310 
311 #ifdef LV_HAVE_NEON
312 #include <arm_neon.h>
313 
/*
 * NEON kernel: writes one (approximate) magnitude per complex input sample.
 *
 * magnitudeVector - output buffer, num_points floats
 * complexVector   - input samples, read as interleaved (real, imag) floats
 * num_points      - number of complex samples to process
 *
 * The vector path processes four samples per iteration and derives the
 * square root from two hardware estimates (reciprocal square root followed
 * by reciprocal) with no refinement step, so its results are less precise
 * than the scalar sqrtf() used for the remaining samples.
 */
static inline void volk_32fc_magnitude_32f_neon(float* magnitudeVector,
                                                const lv_32fc_t* complexVector,
                                                unsigned int num_points)
{
    const unsigned int vectorIterations = num_points / 4;
    const float* in = (const float*)complexVector;
    float* out = magnitudeVector;
    unsigned int i;

    for (i = 0; i < vectorIterations; i++, in += 8, out += 4) {
        /* De-interleaving load: val[0] = 4 real parts, val[1] = 4 imag parts. */
        const float32x4x2_t samples = vld2q_f32(in);

        /* re*re, then multiply-accumulate im*im on top of it. */
        const float32x4_t realSquared = vmulq_f32(samples.val[0], samples.val[0]);
        const float32x4_t sumOfSquares =
            vmlaq_f32(realSquared, samples.val[1], samples.val[1]);

        /* sqrt(x) approximated as 1 / (1 / sqrt(x)) using the two estimate
         * instructions, in place of a vector square-root instruction. */
        const float32x4_t inverseRoot = vrsqrteq_f32(sumOfSquares);
        vst1q_f32(out, vrecpeq_f32(inverseRoot));
    }

    /* Scalar tail for the last (num_points % 4) samples. */
    for (i = vectorIterations * 4; i < num_points; i++) {
        const float re = in[0];
        const float im = in[1];
        in += 2;
        *out++ = sqrtf((re * re) + (im * im));
    }
}
344 #endif /* LV_HAVE_NEON */
345 
346 
347 #ifdef LV_HAVE_NEON
/*
 * Calculates the magnitude of the complexVector and stores the results in
 * the magnitudeVector.
 *
 * magnitudeVector - output buffer, num_points floats
 * complexVector   - input samples, read as interleaved (real, imag) floats
 * num_points      - number of complex samples to process
 *
 * The vector path does not compute a square root. For each sample it forms
 *     magnitude ~= a * max(|re|, |im|) + b * min(|re|, |im|)
 * choosing (a, b) = (0.84, 0.561) when min > threshold * max and
 * (0.99, 0.197) otherwise. The result is therefore an approximation; the
 * leftover (num_points % 4) samples are computed exactly with sqrtf().
 */
static inline void volk_32fc_magnitude_32f_neon_fancy_sweet(
    float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points)
{
    const unsigned int quarter_points = num_points / 4;
    const float* complexVectorPtr = (const float*)complexVector;
    float* magnitudeVectorPtr = magnitudeVector;
    unsigned int number;

    /* Ratio min/max at which the coefficient pair switches
     * (numerically close to sqrt(2) - 1). */
    const float threshold = 0.4142135;

    const float32x4_t a_high = vdupq_n_f32(0.84);
    const float32x4_t b_high = vdupq_n_f32(0.561);
    const float32x4_t a_low = vdupq_n_f32(0.99);
    const float32x4_t b_low = vdupq_n_f32(0.197);

    for (number = 0; number < quarter_points; number++) {
        /* De-interleaving load: val[0] = 4 real parts, val[1] = 4 imag parts. */
        const float32x4x2_t complex_vec = vld2q_f32(complexVectorPtr);

        const float32x4_t real_abs = vabsq_f32(complex_vec.val[0]);
        const float32x4_t imag_abs = vabsq_f32(complex_vec.val[1]);

        const float32x4_t min_vec = vminq_f32(real_abs, imag_abs);
        const float32x4_t max_vec = vmaxq_f32(real_abs, imag_abs);

        /* Per-lane mask: all ones where min > threshold * max. */
        const uint32x4_t use_high = vcgtq_f32(min_vec, vmulq_n_f32(max_vec, threshold));

        /* Branch-free coefficient choice. A bitwise select replaces the
         * former mask/AND/add sequence and its compiler-specific vector
         * casts; lanes holding NaN still produce NaN through the multiplies
         * below. */
        const float32x4_t a_vec = vbslq_f32(use_high, a_high, a_low);
        const float32x4_t b_vec = vbslq_f32(use_high, b_high, b_low);

        /* Weighted sum: b * min + a * max. */
        const float32x4_t magnitude_vec =
            vaddq_f32(vmulq_f32(min_vec, b_vec), vmulq_f32(max_vec, a_vec));
        vst1q_f32(magnitudeVectorPtr, magnitude_vec);

        complexVectorPtr += 8;
        magnitudeVectorPtr += 4;
    }

    /* Scalar tail: exact magnitude for the remaining samples. */
    for (number = quarter_points * 4; number < num_points; number++) {
        const float real = *complexVectorPtr++;
        const float imag = *complexVectorPtr++;
        *magnitudeVectorPtr++ = sqrtf((real * real) + (imag * imag));
    }
}
421 #endif /* LV_HAVE_NEON */
422 
423 
424 #endif /* INCLUDED_volk_32fc_magnitude_32f_a_H */
static void volk_32fc_magnitude_32f_u_sse(float *magnitudeVector, const lv_32fc_t *complexVector, unsigned int num_points)
Definition: volk_32fc_magnitude_32f.h:142
static void volk_32fc_magnitude_32f_u_sse3(float *magnitudeVector, const lv_32fc_t *complexVector, unsigned int num_points)
Definition: volk_32fc_magnitude_32f.h:104
static void volk_32fc_magnitude_32f_u_avx(float *magnitudeVector, const lv_32fc_t *complexVector, unsigned int num_points)
Definition: volk_32fc_magnitude_32f.h:69
static void volk_32fc_magnitude_32f_neon_fancy_sweet(float *magnitudeVector, const lv_32fc_t *complexVector, unsigned int num_points)
Calculates the magnitude of the complexVector and stores the results in the magnitudeVector.
Definition: volk_32fc_magnitude_32f.h:364
static void volk_32fc_magnitude_32f_generic(float *magnitudeVector, const lv_32fc_t *complexVector, unsigned int num_points)
Definition: volk_32fc_magnitude_32f.h:178
static void volk_32fc_magnitude_32f_a_avx(float *magnitudeVector, const lv_32fc_t *complexVector, unsigned int num_points)
Definition: volk_32fc_magnitude_32f.h:206
static void volk_32fc_magnitude_32f_neon(float *magnitudeVector, const lv_32fc_t *complexVector, unsigned int num_points)
Definition: volk_32fc_magnitude_32f.h:314
static void volk_32fc_magnitude_32f_a_sse3(float *magnitudeVector, const lv_32fc_t *complexVector, unsigned int num_points)
Definition: volk_32fc_magnitude_32f.h:242
static void volk_32fc_magnitude_32f_a_sse(float *magnitudeVector, const lv_32fc_t *complexVector, unsigned int num_points)
Definition: volk_32fc_magnitude_32f.h:278
static __m256 _mm256_magnitude_ps(__m256 cplxValue1, __m256 cplxValue2)
Definition: volk_avx_intrinsics.h:108
float complex lv_32fc_t
Definition: volk_complex.h:74
static __m128 _mm_magnitude_ps_sse3(__m128 cplxValue1, __m128 cplxValue2)
Definition: volk_sse3_intrinsics.h:45
static __m128 _mm_magnitude_ps(__m128 cplxValue1, __m128 cplxValue2)
Definition: volk_sse_intrinsics.h:69