/* -*- c++ -*- */
/*
 * Copyright 2012, 2014 Free Software Foundation, Inc.
 * Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com>
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

#ifndef INCLUDED_volk_32fc_s32f_atan2_32f_a_H
#define INCLUDED_volk_32fc_s32f_atan2_32f_a_H

#include <math.h>

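/*
 * Computes out[i] = atan2(imag(in[i]), real(in[i])) / normalizeFactor for
 * each complex input sample. The generic kernel below calls libm's atan2f
 * and serves as the accuracy reference for the vectorized kernels.
 */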
#ifdef LV_HAVE_GENERIC
static inline void volk_32fc_s32f_atan2_32f_generic(float* outputVector,
                                                    const lv_32fc_t* inputVector,
                                                    const float normalizeFactor,
                                                    unsigned int num_points)
{
    float* outPtr = outputVector;
    const float* inPtr = (float*)inputVector;
    const float invNormalizeFactor = 1.f / normalizeFactor;
    unsigned int number = 0;
    for (; number < num_points; number++) {
        const float real = *inPtr++;
        const float imag = *inPtr++;
        *outPtr++ = atan2f(imag, real) * invNormalizeFactor;
    }
}
#endif /* LV_HAVE_GENERIC */
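
/*
 * Minimal usage sketch (illustrative, not part of the original header):
 * call the public dispatcher from <volk/volk.h> with normalizeFactor = pi
 * so the outputs land in [-1, 1]. Guarded with #if 0 so the header stays a
 * pure include.
 */
#if 0
#include <stdio.h>
#include <volk/volk.h>

int main(void)
{
    const unsigned int N = 4;
    lv_32fc_t* in = (lv_32fc_t*)volk_malloc(N * sizeof(lv_32fc_t), volk_get_alignment());
    float* out = (float*)volk_malloc(N * sizeof(float), volk_get_alignment());
    in[0] = lv_cmake(1.f, 0.f);  /* angle 0     -> 0.0  */
    in[1] = lv_cmake(0.f, 1.f);  /* angle pi/2  -> 0.5  */
    in[2] = lv_cmake(-1.f, 0.f); /* angle pi    -> 1.0  */
    in[3] = lv_cmake(0.f, -1.f); /* angle -pi/2 -> -0.5 */
    volk_32fc_s32f_atan2_32f(out, in, 3.1415927f, N);
    for (unsigned int i = 0; i < N; i++)
        printf("out[%u] = %f\n", i, out[i]);
    volk_free(in);
    volk_free(out);
    return 0;
}
#endif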

#ifdef LV_HAVE_GENERIC
#include <volk/volk_common.h>
static inline void volk_32fc_s32f_atan2_32f_polynomial(float* outputVector,
                                                       const lv_32fc_t* inputVector,
                                                       const float normalizeFactor,
                                                       unsigned int num_points)
{
    float* outPtr = outputVector;
    const float* inPtr = (float*)inputVector;
    const float invNormalizeFactor = 1.f / normalizeFactor;
    unsigned int number = 0;
    for (; number < num_points; number++) {
        const float x = *inPtr++;
        const float y = *inPtr++;
        *outPtr++ = volk_atan2(y, x) * invNormalizeFactor;
    }
}
#endif /* LV_HAVE_GENERIC */
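
/*
 * All SIMD kernels below share the same range-reduction scheme:
 *   1. swap so the quotient magnitude is <= 1:  q = |y| > |x| ? x/y : y/x
 *   2. zero the NaN lanes produced by 0/0
 *   3. evaluate an odd polynomial approximation of atan(q) on [-1, 1]
 *   4. undo the swap:  atan(y/x) = sign(q) * pi/2 - atan(q)  for swapped lanes
 *   5. add pi (with the sign of y) where x < 0 to recover the full quadrant
 */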

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>
#include <volk/volk_avx2_fma_intrinsics.h>
#include <volk/volk_avx2_intrinsics.h>
static inline void volk_32fc_s32f_atan2_32f_a_avx2_fma(float* outputVector,
                                                       const lv_32fc_t* complexVector,
                                                       const float normalizeFactor,
                                                       unsigned int num_points)
{
    const float* in = (float*)complexVector;
    float* out = (float*)outputVector;

    const float invNormalizeFactor = 1.f / normalizeFactor;
    const __m256 vinvNormalizeFactor = _mm256_set1_ps(invNormalizeFactor);
    const __m256 pi = _mm256_set1_ps(0x1.921fb6p1f);
    const __m256 pi_2 = _mm256_set1_ps(0x1.921fb6p0f);
    const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
    const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
    const __m256 zero = _mm256_setzero_ps();

    unsigned int number = 0;
    unsigned int eighth_points = num_points / 8;
    for (; number < eighth_points; number++) {
        __m256 z1 = _mm256_load_ps(in);
        in += 8;
        __m256 z2 = _mm256_load_ps(in);
        in += 8;

        __m256 x = _mm256_real(z1, z2);
        __m256 y = _mm256_imag(z1, z2);

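        /* If |y| > |x|, swap numerator and denominator so the quotient
         * magnitude stays <= 1, the interval the polynomial is built for. */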
        __m256 swap_mask = _mm256_cmp_ps(
            _mm256_and_ps(y, abs_mask), _mm256_and_ps(x, abs_mask), _CMP_GT_OS);
        __m256 input = _mm256_div_ps(_mm256_blendv_ps(y, x, swap_mask),
                                     _mm256_blendv_ps(x, y, swap_mask));
        __m256 nan_mask = _mm256_cmp_ps(input, input, _CMP_UNORD_Q);
        input = _mm256_blendv_ps(input, zero, nan_mask);
        __m256 result = _m256_arctan_poly_avx2_fma(input);

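        /* Undo the swap: atan(t) + atan(1/t) = sign(t) * pi/2, so swapped
         * lanes become sign(input) * pi/2 - result. */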
        input =
            _mm256_sub_ps(_mm256_or_ps(pi_2, _mm256_and_ps(input, sign_mask)), result);
        result = _mm256_blendv_ps(result, input, swap_mask);

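        /* Arithmetic right shift of the sign bit gives all-ones lanes where
         * x < 0; add pi with the sign of y there to fix up the quadrant. */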
        __m256 x_sign_mask =
            _mm256_castsi256_ps(_mm256_srai_epi32(_mm256_castps_si256(x), 31));

        result = _mm256_add_ps(
            _mm256_and_ps(_mm256_xor_ps(pi, _mm256_and_ps(sign_mask, y)), x_sign_mask),
            result);
        result = _mm256_mul_ps(result, vinvNormalizeFactor);

        _mm256_store_ps(out, result);
        out += 8;
    }

    number = eighth_points * 8;
    volk_32fc_s32f_atan2_32f_polynomial(
        out, (lv_32fc_t*)in, normalizeFactor, num_points - number);
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */

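/* Same algorithm as the FMA kernel above; only the polynomial evaluation
 * (no fused multiply-add) differs. */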
#if LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>
#include <volk/volk_avx2_intrinsics.h>
static inline void volk_32fc_s32f_atan2_32f_a_avx2(float* outputVector,
                                                   const lv_32fc_t* complexVector,
                                                   const float normalizeFactor,
                                                   unsigned int num_points)
{
    const float* in = (float*)complexVector;
    float* out = (float*)outputVector;

    const float invNormalizeFactor = 1.f / normalizeFactor;
    const __m256 vinvNormalizeFactor = _mm256_set1_ps(invNormalizeFactor);
    const __m256 pi = _mm256_set1_ps(0x1.921fb6p1f);
    const __m256 pi_2 = _mm256_set1_ps(0x1.921fb6p0f);
    const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
    const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
    const __m256 zero = _mm256_setzero_ps();

    unsigned int number = 0;
    unsigned int eighth_points = num_points / 8;
    for (; number < eighth_points; number++) {
        __m256 z1 = _mm256_load_ps(in);
        in += 8;
        __m256 z2 = _mm256_load_ps(in);
        in += 8;

        __m256 x = _mm256_real(z1, z2);
        __m256 y = _mm256_imag(z1, z2);

        __m256 swap_mask = _mm256_cmp_ps(
            _mm256_and_ps(y, abs_mask), _mm256_and_ps(x, abs_mask), _CMP_GT_OS);
        __m256 input = _mm256_div_ps(_mm256_blendv_ps(y, x, swap_mask),
                                     _mm256_blendv_ps(x, y, swap_mask));
        __m256 nan_mask = _mm256_cmp_ps(input, input, _CMP_UNORD_Q);
        input = _mm256_blendv_ps(input, zero, nan_mask);
        __m256 result = _m256_arctan_poly_avx(input);

        input =
            _mm256_sub_ps(_mm256_or_ps(pi_2, _mm256_and_ps(input, sign_mask)), result);
        result = _mm256_blendv_ps(result, input, swap_mask);

        __m256 x_sign_mask =
            _mm256_castsi256_ps(_mm256_srai_epi32(_mm256_castps_si256(x), 31));

        result = _mm256_add_ps(
            _mm256_and_ps(_mm256_xor_ps(pi, _mm256_and_ps(sign_mask, y)), x_sign_mask),
            result);
        result = _mm256_mul_ps(result, vinvNormalizeFactor);

        _mm256_store_ps(out, result);
        out += 8;
    }

    number = eighth_points * 8;
    volk_32fc_s32f_atan2_32f_polynomial(
        out, (lv_32fc_t*)in, normalizeFactor, num_points - number);
}
#endif /* LV_HAVE_AVX2 for aligned */
#endif /* INCLUDED_volk_32fc_s32f_atan2_32f_a_H */

#ifndef INCLUDED_volk_32fc_s32f_atan2_32f_u_H
#define INCLUDED_volk_32fc_s32f_atan2_32f_u_H

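/* The unaligned kernels below mirror the aligned ones; they differ only in
 * using _mm256_loadu_ps / _mm256_storeu_ps for unaligned buffers. */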
#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>
#include <volk/volk_avx2_fma_intrinsics.h>
#include <volk/volk_avx2_intrinsics.h>
static inline void volk_32fc_s32f_atan2_32f_u_avx2_fma(float* outputVector,
                                                       const lv_32fc_t* complexVector,
                                                       const float normalizeFactor,
                                                       unsigned int num_points)
{
    const float* in = (float*)complexVector;
    float* out = (float*)outputVector;

    const float invNormalizeFactor = 1.f / normalizeFactor;
    const __m256 vinvNormalizeFactor = _mm256_set1_ps(invNormalizeFactor);
    const __m256 pi = _mm256_set1_ps(0x1.921fb6p1f);
    const __m256 pi_2 = _mm256_set1_ps(0x1.921fb6p0f);
    const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
    const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
    const __m256 zero = _mm256_setzero_ps();

    unsigned int number = 0;
    unsigned int eighth_points = num_points / 8;
    for (; number < eighth_points; number++) {
        __m256 z1 = _mm256_loadu_ps(in);
        in += 8;
        __m256 z2 = _mm256_loadu_ps(in);
        in += 8;

        __m256 x = _mm256_real(z1, z2);
        __m256 y = _mm256_imag(z1, z2);

        __m256 swap_mask = _mm256_cmp_ps(
            _mm256_and_ps(y, abs_mask), _mm256_and_ps(x, abs_mask), _CMP_GT_OS);
        __m256 input = _mm256_div_ps(_mm256_blendv_ps(y, x, swap_mask),
                                     _mm256_blendv_ps(x, y, swap_mask));
        __m256 nan_mask = _mm256_cmp_ps(input, input, _CMP_UNORD_Q);
        input = _mm256_blendv_ps(input, zero, nan_mask);
        __m256 result = _m256_arctan_poly_avx2_fma(input);

        input =
            _mm256_sub_ps(_mm256_or_ps(pi_2, _mm256_and_ps(input, sign_mask)), result);
        result = _mm256_blendv_ps(result, input, swap_mask);

        __m256 x_sign_mask =
            _mm256_castsi256_ps(_mm256_srai_epi32(_mm256_castps_si256(x), 31));

        result = _mm256_add_ps(
            _mm256_and_ps(_mm256_xor_ps(pi, _mm256_and_ps(sign_mask, y)), x_sign_mask),
            result);
        result = _mm256_mul_ps(result, vinvNormalizeFactor);

        _mm256_storeu_ps(out, result);
        out += 8;
    }

    number = eighth_points * 8;
    volk_32fc_s32f_atan2_32f_polynomial(
        out, (lv_32fc_t*)in, normalizeFactor, num_points - number);
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */

#if LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>
#include <volk/volk_avx2_intrinsics.h>
static inline void volk_32fc_s32f_atan2_32f_u_avx2(float* outputVector,
                                                   const lv_32fc_t* complexVector,
                                                   const float normalizeFactor,
                                                   unsigned int num_points)
{
    const float* in = (float*)complexVector;
    float* out = (float*)outputVector;

    const float invNormalizeFactor = 1.f / normalizeFactor;
    const __m256 vinvNormalizeFactor = _mm256_set1_ps(invNormalizeFactor);
    const __m256 pi = _mm256_set1_ps(0x1.921fb6p1f);
    const __m256 pi_2 = _mm256_set1_ps(0x1.921fb6p0f);
    const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
    const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
    const __m256 zero = _mm256_setzero_ps();

    unsigned int number = 0;
    unsigned int eighth_points = num_points / 8;
    for (; number < eighth_points; number++) {
        __m256 z1 = _mm256_loadu_ps(in);
        in += 8;
        __m256 z2 = _mm256_loadu_ps(in);
        in += 8;

        __m256 x = _mm256_real(z1, z2);
        __m256 y = _mm256_imag(z1, z2);

        __m256 swap_mask = _mm256_cmp_ps(
            _mm256_and_ps(y, abs_mask), _mm256_and_ps(x, abs_mask), _CMP_GT_OS);
        __m256 input = _mm256_div_ps(_mm256_blendv_ps(y, x, swap_mask),
                                     _mm256_blendv_ps(x, y, swap_mask));
        __m256 nan_mask = _mm256_cmp_ps(input, input, _CMP_UNORD_Q);
        input = _mm256_blendv_ps(input, zero, nan_mask);
        __m256 result = _m256_arctan_poly_avx(input);

        input =
            _mm256_sub_ps(_mm256_or_ps(pi_2, _mm256_and_ps(input, sign_mask)), result);
        result = _mm256_blendv_ps(result, input, swap_mask);

        __m256 x_sign_mask =
            _mm256_castsi256_ps(_mm256_srai_epi32(_mm256_castps_si256(x), 31));

        result = _mm256_add_ps(
            _mm256_and_ps(_mm256_xor_ps(pi, _mm256_and_ps(sign_mask, y)), x_sign_mask),
            result);
        result = _mm256_mul_ps(result, vinvNormalizeFactor);

        _mm256_storeu_ps(out, result);
        out += 8;
    }

    number = eighth_points * 8;
    volk_32fc_s32f_atan2_32f_polynomial(
        out, (lv_32fc_t*)in, normalizeFactor, num_points - number);
}
#endif /* LV_HAVE_AVX2 for unaligned */

#ifdef LV_HAVE_RVV
#include <riscv_vector.h>
#include <volk/volk_rvv_intrinsics.h>

static inline void volk_32fc_s32f_atan2_32f_rvv(float* outputVector,
                                                const lv_32fc_t* inputVector,
                                                const float normalizeFactor,
                                                unsigned int num_points)
{
    size_t vlmax = __riscv_vsetvlmax_e32m2();

    const vfloat32m2_t norm = __riscv_vfmv_v_f_f32m2(1 / normalizeFactor, vlmax);
    const vfloat32m2_t cpi = __riscv_vfmv_v_f_f32m2(3.1415927f, vlmax);
    const vfloat32m2_t cpio2 = __riscv_vfmv_v_f_f32m2(1.5707964f, vlmax);
    const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(+0x1.ffffeap-1f, vlmax);
    const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(-0x1.55437p-2f, vlmax);
    const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(+0x1.972be6p-3f, vlmax);
    const vfloat32m2_t c7 = __riscv_vfmv_v_f_f32m2(-0x1.1436ap-3f, vlmax);
    const vfloat32m2_t c9 = __riscv_vfmv_v_f_f32m2(+0x1.5785aap-4f, vlmax);
    const vfloat32m2_t c11 = __riscv_vfmv_v_f_f32m2(-0x1.2f3004p-5f, vlmax);
    const vfloat32m2_t c13 = __riscv_vfmv_v_f_f32m2(+0x1.01a37cp-7f, vlmax);

    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += vl) {
        vl = __riscv_vsetvl_e32m2(n);
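        /* Load complex pairs as 64-bit words; narrowing shifts by 0 and 32
         * split them into the real and imaginary 32-bit halves. */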
        vuint64m4_t v = __riscv_vle64_v_u64m4((const uint64_t*)inputVector, vl);
        vfloat32m2_t vr = __riscv_vreinterpret_f32m2(__riscv_vnsrl(v, 0, vl));
        vfloat32m2_t vi = __riscv_vreinterpret_f32m2(__riscv_vnsrl(v, 32, vl));
        vbool16_t mswap = __riscv_vmfgt(__riscv_vfabs(vi, vl), __riscv_vfabs(vr, vl), vl);
        vfloat32m2_t x = __riscv_vfdiv(
            __riscv_vmerge(vi, vr, mswap, vl), __riscv_vmerge(vr, vi, mswap, vl), vl);
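        /* vfclass sets bits 8/9 for NaN, so a class value above 0xFF flags
         * the 0/0 lanes; replace those with 0 via an integer merge. */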
        vbool16_t mnan = __riscv_vmsgtu(__riscv_vfclass(x, vl), 0xFF, vl);
        x = __riscv_vreinterpret_f32m2(
            __riscv_vmerge(__riscv_vreinterpret_u32m2(x), 0, mnan, vl));

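        /* Horner evaluation of the odd polynomial
         * p = x * (c1 + xx*(c3 + xx*(... + xx*c13))) approximating atan(x). */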
        vfloat32m2_t xx = __riscv_vfmul(x, x, vl);
        vfloat32m2_t p = c13;
        p = __riscv_vfmadd(p, xx, c11, vl);
        p = __riscv_vfmadd(p, xx, c9, vl);
        p = __riscv_vfmadd(p, xx, c7, vl);
        p = __riscv_vfmadd(p, xx, c5, vl);
        p = __riscv_vfmadd(p, xx, c3, vl);
        p = __riscv_vfmadd(p, xx, c1, vl);
        p = __riscv_vfmul(p, x, vl);

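        /* Undo the swap (sign(x) * pi/2 - p for swapped lanes), then add pi
         * carrying the sign of the imaginary part where the real part is
         * negative (RISCV_VMFLTZ masks vr < 0). */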
        x = __riscv_vfsub(__riscv_vfsgnj(cpio2, x, vl), p, vl);
        p = __riscv_vmerge(p, x, mswap, vl);
        p = __riscv_vfadd_mu(
            RISCV_VMFLTZ(32m2, vr, vl), p, p, __riscv_vfsgnjx(cpi, vi, vl), vl);

        __riscv_vse32(outputVector, __riscv_vfmul(p, norm, vl), vl);
    }
}
#endif /*LV_HAVE_RVV*/

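/* Same computation as the RVV kernel; the segmented load deinterleaves the
 * real and imaginary parts directly instead of narrowing 64-bit words. */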
#ifdef LV_HAVE_RVVSEG
#include <riscv_vector.h>
#include <volk/volk_rvv_intrinsics.h>

static inline void volk_32fc_s32f_atan2_32f_rvvseg(float* outputVector,
                                                   const lv_32fc_t* inputVector,
                                                   const float normalizeFactor,
                                                   unsigned int num_points)
{
    size_t vlmax = __riscv_vsetvlmax_e32m2();

    const vfloat32m2_t norm = __riscv_vfmv_v_f_f32m2(1 / normalizeFactor, vlmax);
    const vfloat32m2_t cpi = __riscv_vfmv_v_f_f32m2(3.1415927f, vlmax);
    const vfloat32m2_t cpio2 = __riscv_vfmv_v_f_f32m2(1.5707964f, vlmax);
    const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(+0x1.ffffeap-1f, vlmax);
    const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(-0x1.55437p-2f, vlmax);
    const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(+0x1.972be6p-3f, vlmax);
    const vfloat32m2_t c7 = __riscv_vfmv_v_f_f32m2(-0x1.1436ap-3f, vlmax);
    const vfloat32m2_t c9 = __riscv_vfmv_v_f_f32m2(+0x1.5785aap-4f, vlmax);
    const vfloat32m2_t c11 = __riscv_vfmv_v_f_f32m2(-0x1.2f3004p-5f, vlmax);
    const vfloat32m2_t c13 = __riscv_vfmv_v_f_f32m2(+0x1.01a37cp-7f, vlmax);

    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += vl) {
        vl = __riscv_vsetvl_e32m2(n);
        vfloat32m2x2_t v = __riscv_vlseg2e32_v_f32m2x2((const float*)inputVector, vl);
        vfloat32m2_t vr = __riscv_vget_f32m2(v, 0), vi = __riscv_vget_f32m2(v, 1);
        vbool16_t mswap = __riscv_vmfgt(__riscv_vfabs(vi, vl), __riscv_vfabs(vr, vl), vl);
        vfloat32m2_t x = __riscv_vfdiv(
            __riscv_vmerge(vi, vr, mswap, vl), __riscv_vmerge(vr, vi, mswap, vl), vl);
        vbool16_t mnan = __riscv_vmsgtu(__riscv_vfclass(x, vl), 0xFF, vl);
        x = __riscv_vreinterpret_f32m2(
            __riscv_vmerge(__riscv_vreinterpret_u32m2(x), 0, mnan, vl));

        vfloat32m2_t xx = __riscv_vfmul(x, x, vl);
        vfloat32m2_t p = c13;
        p = __riscv_vfmadd(p, xx, c11, vl);
        p = __riscv_vfmadd(p, xx, c9, vl);
        p = __riscv_vfmadd(p, xx, c7, vl);
        p = __riscv_vfmadd(p, xx, c5, vl);
        p = __riscv_vfmadd(p, xx, c3, vl);
        p = __riscv_vfmadd(p, xx, c1, vl);
        p = __riscv_vfmul(p, x, vl);

        x = __riscv_vfsub(__riscv_vfsgnj(cpio2, x, vl), p, vl);
        p = __riscv_vmerge(p, x, mswap, vl);
        p = __riscv_vfadd_mu(
            RISCV_VMFLTZ(32m2, vr, vl), p, p, __riscv_vfsgnjx(cpi, vi, vl), vl);

        __riscv_vse32(outputVector, __riscv_vfmul(p, norm, vl), vl);
    }
}
#endif /*LV_HAVE_RVVSEG*/

#endif /* INCLUDED_volk_32fc_s32f_atan2_32f_u_H */