Vector Optimized Library of Kernels  3.2.0
Architecture-tuned implementations of math kernels
volk_32f_log2_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
79 #ifndef INCLUDED_volk_32f_log2_32f_a_H
80 #define INCLUDED_volk_32f_log2_32f_a_H
81 
82 #include <inttypes.h>
83 #include <math.h>
84 #include <stdio.h>
85 #include <stdlib.h>
86 
87 #define LOG_POLY_DEGREE 6
88 
89 #ifdef LV_HAVE_GENERIC
90 
/* Scalar fallback kernel: computes log2 of each of the num_points input
 * floats with the shared scalar helper log2f_non_ieee() (volk_common.h),
 * writing the results to bVector. */
static inline void
volk_32f_log2_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
{
    for (unsigned int i = 0; i < num_points; i++) {
        bVector[i] = log2f_non_ieee(aVector[i]);
    }
}
102 #endif /* LV_HAVE_GENERIC */
103 
104 #if LV_HAVE_AVX2 && LV_HAVE_FMA
105 #include <immintrin.h>
106 
/* Horner-scheme polynomial evaluators on AVX2+FMA: POLYn_FMAAVX2(x, c0..cn)
 * computes c0 + c1*x + ... + cn*x^n over a __m256 vector, using one fused
 * multiply-add per degree. Coefficients are splat with _mm256_set1_ps. */
#define POLY0_FMAAVX2(x, c0) _mm256_set1_ps(c0)
#define POLY1_FMAAVX2(x, c0, c1) \
    _mm256_fmadd_ps(POLY0_FMAAVX2(x, c1), x, _mm256_set1_ps(c0))
#define POLY2_FMAAVX2(x, c0, c1, c2) \
    _mm256_fmadd_ps(POLY1_FMAAVX2(x, c1, c2), x, _mm256_set1_ps(c0))
#define POLY3_FMAAVX2(x, c0, c1, c2, c3) \
    _mm256_fmadd_ps(POLY2_FMAAVX2(x, c1, c2, c3), x, _mm256_set1_ps(c0))
#define POLY4_FMAAVX2(x, c0, c1, c2, c3, c4) \
    _mm256_fmadd_ps(POLY3_FMAAVX2(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
#define POLY5_FMAAVX2(x, c0, c1, c2, c3, c4, c5) \
    _mm256_fmadd_ps(POLY4_FMAAVX2(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))
118 
/*
 * AVX2+FMA kernel (aligned loads/stores), 8 floats per iteration.
 * Decomposes each input x = 2^exp * frac with frac in [1, 2), so
 *   log2(x) = exp + log2(frac),
 * where log2(frac) is approximated by a minimax polynomial (selected by
 * LOG_POLY_DEGREE) evaluated with the POLYn_FMAAVX2 macros and scaled by
 * (frac - 1).
 * NOTE(review): x <= 0, NaN, inf and denormals get no special handling;
 * their results fall out of the raw bit manipulation.
 */
static inline void volk_32f_log2_32f_a_avx2_fma(float* bVector,
                                                const float* aVector,
                                                unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8; // full 8-wide vectors

    __m256 aVal, bVal, mantissa, frac, leadingOne;
    __m256i bias, exp;

    for (; number < eighthPoints; number++) {

        aVal = _mm256_load_ps(aPtr);
        bias = _mm256_set1_epi32(127);  // IEEE-754 single-precision exponent bias
        leadingOne = _mm256_set1_ps(1.0f);
        // Isolate the 8 exponent bits, shift them down past the 23-bit
        // significand, and subtract the bias.
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        bVal = _mm256_cvtepi32_ps(exp); // integer part of log2(x)

        // Now to extract mantissa
        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

#if LOG_POLY_DEGREE == 6
        mantissa = POLY5_FMAAVX2(frac,
                                 3.1157899f,
                                 -3.3241990f,
                                 2.5988452f,
                                 -1.2315303f,
                                 3.1821337e-1f,
                                 -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
        mantissa = POLY4_FMAAVX2(frac,
                                 2.8882704548164776201f,
                                 -2.52074962577807006663f,
                                 1.48116647521213171641f,
                                 -0.465725644288844778798f,
                                 0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
        mantissa = POLY3_FMAAVX2(frac,
                                 2.61761038894603480148f,
                                 -1.75647175389045657003f,
                                 0.688243882994381274313f,
                                 -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
        mantissa = POLY2_FMAAVX2(frac,
                                 2.28330284476918490682f,
                                 -1.04913055217340124191f,
                                 0.204446009836232697516f);
#else
#error
#endif

        // log2(frac) ~= poly(frac) * (frac - 1); fused with the exponent add.
        bVal = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), bVal);
        _mm256_store_ps(bPtr, bVal);

        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    // Finish the remaining (num_points % 8) samples with the scalar kernel.
    volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
}
189 
190 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
191 
192 #ifdef LV_HAVE_AVX2
193 #include <immintrin.h>
194 
/* Horner-scheme polynomial evaluators on plain AVX2 (no FMA):
 * POLYn_AVX2(x, c0..cn) computes c0 + c1*x + ... + cn*x^n over a __m256
 * vector with separate multiply and add steps. */
#define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
#define POLY1_AVX2(x, c0, c1) \
    _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
#define POLY2_AVX2(x, c0, c1, c2) \
    _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
#define POLY3_AVX2(x, c0, c1, c2, c3) \
    _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
    _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
    _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
206 
/*
 * AVX2 kernel (aligned loads/stores), 8 floats per iteration.
 * Same algorithm as the FMA variant: x = 2^exp * frac, frac in [1, 2),
 * log2(x) = exp + poly(frac) * (frac - 1), but using separate
 * multiply/add intrinsics via the POLYn_AVX2 macros.
 * NOTE(review): no special handling for x <= 0, NaN, inf or denormals.
 */
static inline void
volk_32f_log2_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8; // full 8-wide vectors

    __m256 aVal, bVal, mantissa, frac, leadingOne;
    __m256i bias, exp;

    for (; number < eighthPoints; number++) {

        aVal = _mm256_load_ps(aPtr);
        bias = _mm256_set1_epi32(127);  // IEEE-754 exponent bias
        leadingOne = _mm256_set1_ps(1.0f);
        // Extract and debias the exponent field.
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        bVal = _mm256_cvtepi32_ps(exp); // integer part of log2(x)

        // Now to extract mantissa
        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

#if LOG_POLY_DEGREE == 6
        mantissa = POLY5_AVX2(frac,
                              3.1157899f,
                              -3.3241990f,
                              2.5988452f,
                              -1.2315303f,
                              3.1821337e-1f,
                              -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
        mantissa = POLY4_AVX2(frac,
                              2.8882704548164776201f,
                              -2.52074962577807006663f,
                              1.48116647521213171641f,
                              -0.465725644288844778798f,
                              0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
        mantissa = POLY3_AVX2(frac,
                              2.61761038894603480148f,
                              -1.75647175389045657003f,
                              0.688243882994381274313f,
                              -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
        mantissa = POLY2_AVX2(frac,
                              2.28330284476918490682f,
                              -1.04913055217340124191f,
                              0.204446009836232697516f);
#else
#error
#endif

        // log2(frac) ~= poly(frac) * (frac - 1), added to the exponent.
        bVal =
            _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), bVal);
        _mm256_store_ps(bPtr, bVal);

        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    // Finish the remaining (num_points % 8) samples with the scalar kernel.
    volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
}
277 
278 #endif /* LV_HAVE_AVX2 for aligned */
279 
280 #ifdef LV_HAVE_SSE4_1
281 #include <smmintrin.h>
282 
/* Horner-scheme polynomial evaluators on SSE: POLYn(x, c0..cn) computes
 * c0 + c1*x + ... + cn*x^n over a __m128 vector. */
#define POLY0(x, c0) _mm_set1_ps(c0)
#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
#define POLY3(x, c0, c1, c2, c3) \
    _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
#define POLY4(x, c0, c1, c2, c3, c4) \
    _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
#define POLY5(x, c0, c1, c2, c3, c4, c5) \
    _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
292 
/*
 * SSE4.1 kernel (aligned loads/stores), 4 floats per iteration.
 * x = 2^exp * frac with frac in [1, 2); log2(x) = exp + poly(frac)*(frac-1),
 * with the polynomial chosen by LOG_POLY_DEGREE and evaluated via POLYn.
 * NOTE(review): no special handling for x <= 0, NaN, inf or denormals.
 */
static inline void
volk_32f_log2_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4; // full 4-wide vectors

    __m128 aVal, bVal, mantissa, frac, leadingOne;
    __m128i bias, exp;

    for (; number < quarterPoints; number++) {

        aVal = _mm_load_ps(aPtr);
        bias = _mm_set1_epi32(127);  // IEEE-754 exponent bias
        leadingOne = _mm_set1_ps(1.0f);
        // Extract and debias the 8-bit exponent field.
        exp = _mm_sub_epi32(
            _mm_srli_epi32(
                _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23),
            bias);
        bVal = _mm_cvtepi32_ps(exp); // integer part of log2(x)

        // Now to extract mantissa
        frac = _mm_or_ps(leadingOne,
                         _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));

#if LOG_POLY_DEGREE == 6
        mantissa = POLY5(frac,
                         3.1157899f,
                         -3.3241990f,
                         2.5988452f,
                         -1.2315303f,
                         3.1821337e-1f,
                         -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
        mantissa = POLY4(frac,
                         2.8882704548164776201f,
                         -2.52074962577807006663f,
                         1.48116647521213171641f,
                         -0.465725644288844778798f,
                         0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
        mantissa = POLY3(frac,
                         2.61761038894603480148f,
                         -1.75647175389045657003f,
                         0.688243882994381274313f,
                         -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
        mantissa = POLY2(frac,
                         2.28330284476918490682f,
                         -1.04913055217340124191f,
                         0.204446009836232697516f);
#else
#error
#endif

        // log2(frac) ~= poly(frac) * (frac - 1), added to the exponent.
        bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
        _mm_store_ps(bPtr, bVal);

        aPtr += 4;
        bPtr += 4;
    }

    number = quarterPoints * 4;
    // Finish the remaining (num_points % 4) samples with the scalar kernel.
    volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
}
360 
361 #endif /* LV_HAVE_SSE4_1 for aligned */
362 
363 #ifdef LV_HAVE_NEON
364 #include <arm_neon.h>
365 
/* these macros allow us to embed logs in other kernels */
/* Declares the constants required by VLOG2Q_NEON_F32: the implicit
 * leading-one bit of the significand, a degree-6 minimax polynomial for
 * log2 on [1, 2), and the IEEE-754 exponent/significand masks and bias.
 * Must appear in the same scope before VLOG2Q_NEON_F32 is used. */
#define VLOG2Q_NEON_PREAMBLE() \
    int32x4_t one = vdupq_n_s32(0x000800000); /* == 0x00800000: leading 1 in Q23 */ \
    /* minimax polynomial */ \
    float32x4_t p0 = vdupq_n_f32(-3.0400402727048585); \
    float32x4_t p1 = vdupq_n_f32(6.1129631282966113); \
    float32x4_t p2 = vdupq_n_f32(-5.3419892024633207); \
    float32x4_t p3 = vdupq_n_f32(3.2865287703753912); \
    float32x4_t p4 = vdupq_n_f32(-1.2669182593441635); \
    float32x4_t p5 = vdupq_n_f32(0.2751487703421256); \
    float32x4_t p6 = vdupq_n_f32(-0.0256910888150985); \
    int32x4_t exp_mask = vdupq_n_s32(0x7f800000); \
    int32x4_t sig_mask = vdupq_n_s32(0x007fffff); \
    int32x4_t exp_bias = vdupq_n_s32(127);
380 
381 
/* Computes log2_approx (float32x4_t) from aval, a float vector reinterpreted
 * as int32x4_t. Requires the constants declared by VLOG2Q_NEON_PREAMBLE() to
 * be in scope. The polynomial is evaluated term by term (powers built up
 * explicitly rather than Horner form) so the multiplies are independent. */
#define VLOG2Q_NEON_F32(log2_approx, aval) \
    int32x4_t exponent_i = vandq_s32(aval, exp_mask); \
    int32x4_t significand_i = vandq_s32(aval, sig_mask); \
    exponent_i = vshrq_n_s32(exponent_i, 23); \
    \
    /* extract the exponent and significand \
       we can treat this as fixed point to save ~9% on the \
       conversion + float add */ \
    significand_i = vorrq_s32(one, significand_i); \
    float32x4_t significand_f = vcvtq_n_f32_s32(significand_i, 23); \
    /* debias the exponent and convert to float */ \
    exponent_i = vsubq_s32(exponent_i, exp_bias); \
    float32x4_t exponent_f = vcvtq_f32_s32(exponent_i); \
    \
    /* put the significand through a polynomial fit of log2(x) [1,2] \
       add the result to the exponent */ \
    log2_approx = vaddq_f32(exponent_f, p0); /* p0 */ \
    float32x4_t tmp1 = vmulq_f32(significand_f, p1); /* p1 * x */ \
    log2_approx = vaddq_f32(log2_approx, tmp1); \
    float32x4_t sig_2 = vmulq_f32(significand_f, significand_f); /* x^2 */ \
    tmp1 = vmulq_f32(sig_2, p2); /* p2 * x^2 */ \
    log2_approx = vaddq_f32(log2_approx, tmp1); \
    \
    float32x4_t sig_3 = vmulq_f32(sig_2, significand_f); /* x^3 */ \
    tmp1 = vmulq_f32(sig_3, p3); /* p3 * x^3 */ \
    log2_approx = vaddq_f32(log2_approx, tmp1); \
    float32x4_t sig_4 = vmulq_f32(sig_2, sig_2); /* x^4 */ \
    tmp1 = vmulq_f32(sig_4, p4); /* p4 * x^4 */ \
    log2_approx = vaddq_f32(log2_approx, tmp1); \
    float32x4_t sig_5 = vmulq_f32(sig_3, sig_2); /* x^5 */ \
    tmp1 = vmulq_f32(sig_5, p5); /* p5 * x^5 */ \
    log2_approx = vaddq_f32(log2_approx, tmp1); \
    float32x4_t sig_6 = vmulq_f32(sig_3, sig_3); /* x^6 */ \
    tmp1 = vmulq_f32(sig_6, p6); /* p6 * x^6 */ \
    log2_approx = vaddq_f32(log2_approx, tmp1);
417 
/*
 * NEON kernel: computes log2 of 4 floats per iteration using the
 * VLOG2Q_NEON_* macros; the remaining (num_points % 4) samples are handled
 * by the scalar generic kernel.
 *
 * Fix: the invocation of VLOG2Q_NEON_PREAMBLE() was missing, so the
 * constants referenced by VLOG2Q_NEON_F32 (one, p0..p6, exp_mask, sig_mask,
 * exp_bias) were never declared and the function could not compile.
 * Also use a const-qualified cast when reinterpreting the input pointer.
 */
static inline void
volk_32f_log2_32f_neon(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;
    unsigned int number;
    const unsigned int quarterPoints = num_points / 4;

    int32x4_t aval;
    float32x4_t log2_approx;

    // Declare polynomial coefficients, bit masks and exponent bias used by
    // VLOG2Q_NEON_F32 below.
    VLOG2Q_NEON_PREAMBLE()
    // lms
    // p0 = vdupq_n_f32(-1.649132280361871);
    // p1 = vdupq_n_f32(1.995047138579499);
    // p2 = vdupq_n_f32(-0.336914839219728);

    // keep in mind a single precision float is represented as
    // (-1)^sign * 2^exp * 1.significand, so the log2 is
    // log2(2^exp * sig) = exponent + log2(1 + significand/(1<<23)
    for (number = 0; number < quarterPoints; ++number) {
        // load float in to an int register without conversion
        aval = vld1q_s32((const int32_t*)aPtr);

        VLOG2Q_NEON_F32(log2_approx, aval)

        vst1q_f32(bPtr, log2_approx);

        aPtr += 4;
        bPtr += 4;
    }

    number = quarterPoints * 4;
    volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
}
453 
454 #endif /* LV_HAVE_NEON */
455 
456 
457 #endif /* INCLUDED_volk_32f_log2_32f_a_H */
458 
459 #ifndef INCLUDED_volk_32f_log2_32f_u_H
460 #define INCLUDED_volk_32f_log2_32f_u_H
461 
462 
463 #ifdef LV_HAVE_SSE4_1
464 #include <smmintrin.h>
465 
/* Horner-scheme polynomial evaluators on SSE (unaligned section).
 * Token-identical redefinition of the aligned-section macros, which is a
 * benign redefinition per C11 6.10.3 when both header halves are included. */
#define POLY0(x, c0) _mm_set1_ps(c0)
#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
#define POLY3(x, c0, c1, c2, c3) \
    _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
#define POLY4(x, c0, c1, c2, c3, c4) \
    _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
#define POLY5(x, c0, c1, c2, c3, c4, c5) \
    _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
475 
/*
 * SSE4.1 kernel (unaligned loads/stores), 4 floats per iteration.
 * Identical algorithm to the aligned variant: x = 2^exp * frac with frac in
 * [1, 2); log2(x) = exp + poly(frac) * (frac - 1). Only the load/store
 * intrinsics differ (_mm_loadu_ps / _mm_storeu_ps).
 * NOTE(review): no special handling for x <= 0, NaN, inf or denormals.
 */
static inline void
volk_32f_log2_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4; // full 4-wide vectors

    __m128 aVal, bVal, mantissa, frac, leadingOne;
    __m128i bias, exp;

    for (; number < quarterPoints; number++) {

        aVal = _mm_loadu_ps(aPtr); // unaligned load
        bias = _mm_set1_epi32(127);  // IEEE-754 exponent bias
        leadingOne = _mm_set1_ps(1.0f);
        // Extract and debias the 8-bit exponent field.
        exp = _mm_sub_epi32(
            _mm_srli_epi32(
                _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23),
            bias);
        bVal = _mm_cvtepi32_ps(exp); // integer part of log2(x)

        // Now to extract mantissa
        frac = _mm_or_ps(leadingOne,
                         _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));

#if LOG_POLY_DEGREE == 6
        mantissa = POLY5(frac,
                         3.1157899f,
                         -3.3241990f,
                         2.5988452f,
                         -1.2315303f,
                         3.1821337e-1f,
                         -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
        mantissa = POLY4(frac,
                         2.8882704548164776201f,
                         -2.52074962577807006663f,
                         1.48116647521213171641f,
                         -0.465725644288844778798f,
                         0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
        mantissa = POLY3(frac,
                         2.61761038894603480148f,
                         -1.75647175389045657003f,
                         0.688243882994381274313f,
                         -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
        mantissa = POLY2(frac,
                         2.28330284476918490682f,
                         -1.04913055217340124191f,
                         0.204446009836232697516f);
#else
#error
#endif

        // log2(frac) ~= poly(frac) * (frac - 1), added to the exponent.
        bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
        _mm_storeu_ps(bPtr, bVal);

        aPtr += 4;
        bPtr += 4;
    }

    number = quarterPoints * 4;
    // Finish the remaining (num_points % 4) samples with the scalar kernel.
    volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
}
543 
544 #endif /* LV_HAVE_SSE4_1 for unaligned */
545 
546 #if LV_HAVE_AVX2 && LV_HAVE_FMA
547 #include <immintrin.h>
548 
/* Horner-scheme AVX2+FMA polynomial evaluators (unaligned section).
 * Token-identical redefinition of the aligned-section macros, which is a
 * benign redefinition per C11 6.10.3 when both header halves are included. */
#define POLY0_FMAAVX2(x, c0) _mm256_set1_ps(c0)
#define POLY1_FMAAVX2(x, c0, c1) \
    _mm256_fmadd_ps(POLY0_FMAAVX2(x, c1), x, _mm256_set1_ps(c0))
#define POLY2_FMAAVX2(x, c0, c1, c2) \
    _mm256_fmadd_ps(POLY1_FMAAVX2(x, c1, c2), x, _mm256_set1_ps(c0))
#define POLY3_FMAAVX2(x, c0, c1, c2, c3) \
    _mm256_fmadd_ps(POLY2_FMAAVX2(x, c1, c2, c3), x, _mm256_set1_ps(c0))
#define POLY4_FMAAVX2(x, c0, c1, c2, c3, c4) \
    _mm256_fmadd_ps(POLY3_FMAAVX2(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
#define POLY5_FMAAVX2(x, c0, c1, c2, c3, c4, c5) \
    _mm256_fmadd_ps(POLY4_FMAAVX2(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))
560 
/*
 * AVX2+FMA kernel (unaligned loads/stores), 8 floats per iteration.
 * Identical algorithm to the aligned FMA variant: x = 2^exp * frac with
 * frac in [1, 2); log2(x) = exp + poly(frac) * (frac - 1). Only the
 * load/store intrinsics differ (_mm256_loadu_ps / _mm256_storeu_ps).
 * NOTE(review): no special handling for x <= 0, NaN, inf or denormals.
 */
static inline void volk_32f_log2_32f_u_avx2_fma(float* bVector,
                                                const float* aVector,
                                                unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8; // full 8-wide vectors

    __m256 aVal, bVal, mantissa, frac, leadingOne;
    __m256i bias, exp;

    for (; number < eighthPoints; number++) {

        aVal = _mm256_loadu_ps(aPtr); // unaligned load
        bias = _mm256_set1_epi32(127);  // IEEE-754 exponent bias
        leadingOne = _mm256_set1_ps(1.0f);
        // Extract and debias the 8-bit exponent field.
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        bVal = _mm256_cvtepi32_ps(exp); // integer part of log2(x)

        // Now to extract mantissa
        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

#if LOG_POLY_DEGREE == 6
        mantissa = POLY5_FMAAVX2(frac,
                                 3.1157899f,
                                 -3.3241990f,
                                 2.5988452f,
                                 -1.2315303f,
                                 3.1821337e-1f,
                                 -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
        mantissa = POLY4_FMAAVX2(frac,
                                 2.8882704548164776201f,
                                 -2.52074962577807006663f,
                                 1.48116647521213171641f,
                                 -0.465725644288844778798f,
                                 0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
        mantissa = POLY3_FMAAVX2(frac,
                                 2.61761038894603480148f,
                                 -1.75647175389045657003f,
                                 0.688243882994381274313f,
                                 -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
        mantissa = POLY2_FMAAVX2(frac,
                                 2.28330284476918490682f,
                                 -1.04913055217340124191f,
                                 0.204446009836232697516f);
#else
#error
#endif

        // log2(frac) ~= poly(frac) * (frac - 1); fused with the exponent add.
        bVal = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), bVal);
        _mm256_storeu_ps(bPtr, bVal);

        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    // Finish the remaining (num_points % 8) samples with the scalar kernel.
    volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
}
631 
632 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
633 
634 #ifdef LV_HAVE_AVX2
635 #include <immintrin.h>
636 
/* Horner-scheme plain-AVX2 polynomial evaluators (unaligned section).
 * Token-identical redefinition of the aligned-section macros, which is a
 * benign redefinition per C11 6.10.3 when both header halves are included. */
#define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
#define POLY1_AVX2(x, c0, c1) \
    _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
#define POLY2_AVX2(x, c0, c1, c2) \
    _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
#define POLY3_AVX2(x, c0, c1, c2, c3) \
    _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
    _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
    _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
648 
/*
 * AVX2 kernel (unaligned loads/stores), 8 floats per iteration.
 * Identical algorithm to the aligned non-FMA variant: x = 2^exp * frac with
 * frac in [1, 2); log2(x) = exp + poly(frac) * (frac - 1). Only the
 * load/store intrinsics differ (_mm256_loadu_ps / _mm256_storeu_ps).
 * NOTE(review): no special handling for x <= 0, NaN, inf or denormals.
 */
static inline void
volk_32f_log2_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8; // full 8-wide vectors

    __m256 aVal, bVal, mantissa, frac, leadingOne;
    __m256i bias, exp;

    for (; number < eighthPoints; number++) {

        aVal = _mm256_loadu_ps(aPtr); // unaligned load
        bias = _mm256_set1_epi32(127);  // IEEE-754 exponent bias
        leadingOne = _mm256_set1_ps(1.0f);
        // Extract and debias the 8-bit exponent field.
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        bVal = _mm256_cvtepi32_ps(exp); // integer part of log2(x)

        // Now to extract mantissa
        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

#if LOG_POLY_DEGREE == 6
        mantissa = POLY5_AVX2(frac,
                              3.1157899f,
                              -3.3241990f,
                              2.5988452f,
                              -1.2315303f,
                              3.1821337e-1f,
                              -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
        mantissa = POLY4_AVX2(frac,
                              2.8882704548164776201f,
                              -2.52074962577807006663f,
                              1.48116647521213171641f,
                              -0.465725644288844778798f,
                              0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
        mantissa = POLY3_AVX2(frac,
                              2.61761038894603480148f,
                              -1.75647175389045657003f,
                              0.688243882994381274313f,
                              -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
        mantissa = POLY2_AVX2(frac,
                              2.28330284476918490682f,
                              -1.04913055217340124191f,
                              0.204446009836232697516f);
#else
#error
#endif

        // log2(frac) ~= poly(frac) * (frac - 1), added to the exponent.
        bVal =
            _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), bVal);
        _mm256_storeu_ps(bPtr, bVal);

        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    // Finish the remaining (num_points % 8) samples with the scalar kernel.
    volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
}
719 
720 #endif /* LV_HAVE_AVX2 for unaligned */
721 
722 #ifdef LV_HAVE_RVV
723 #include <riscv_vector.h>
724 
/*
 * RISC-V Vector kernel. Strip-mined loop: each iteration processes vl
 * elements (vl chosen per iteration by vsetvl), so all num_points samples
 * are covered with no scalar tail call. Same algorithm as the x86 kernels:
 * x = 2^exp * frac, frac in [1, 2), log2(x) = exp + poly(frac) * (frac - 1),
 * with the polynomial evaluated in Horner form via vfmadd.
 */
static inline void
volk_32f_log2_32f_rvv(float* bVector, const float* aVector, unsigned int num_points)
{
    size_t vlmax = __riscv_vsetvlmax_e32m2();

#if LOG_POLY_DEGREE == 6
    // Polynomial coefficients splat across the whole vector once, up front.
    // c0 holds the highest-degree coefficient (Horner starts from it below).
    const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(3.1157899f, vlmax);
    const vfloat32m2_t c4 = __riscv_vfmv_v_f_f32m2(-3.3241990f, vlmax);
    const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(2.5988452f, vlmax);
    const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(-1.2315303f, vlmax);
    const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(3.1821337e-1f, vlmax);
    const vfloat32m2_t c0 = __riscv_vfmv_v_f_f32m2(-3.4436006e-2f, vlmax);
#elif LOG_POLY_DEGREE == 5
    const vfloat32m2_t c4 = __riscv_vfmv_v_f_f32m2(2.8882704548164776201f, vlmax);
    const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(-2.52074962577807006663f, vlmax);
    const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(1.48116647521213171641f, vlmax);
    const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(-0.465725644288844778798f, vlmax);
    const vfloat32m2_t c0 = __riscv_vfmv_v_f_f32m2(0.0596515482674574969533f, vlmax);
#elif LOG_POLY_DEGREE == 4
    const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(2.61761038894603480148f, vlmax);
    const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(-1.75647175389045657003f, vlmax);
    const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(0.688243882994381274313f, vlmax);
    const vfloat32m2_t c0 = __riscv_vfmv_v_f_f32m2(-0.107254423828329604454f, vlmax);
#elif LOG_POLY_DEGREE == 3
    const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(2.28330284476918490682f, vlmax);
    const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(-1.04913055217340124191f, vlmax);
    const vfloat32m2_t c0 = __riscv_vfmv_v_f_f32m2(0.204446009836232697516f, vlmax);
#else
#error
#endif

    const vfloat32m2_t cf1 = __riscv_vfmv_v_f_f32m2(1.0f, vlmax);
    const vint32m2_t m1 = __riscv_vreinterpret_i32m2(cf1);   // bits of 1.0f (0x3f800000)
    const vint32m2_t m2 = __riscv_vmv_v_x_i32m2(0x7FFFFF, vlmax); // significand mask
    const vint32m2_t c127 = __riscv_vmv_v_x_i32m2(127, vlmax);    // exponent bias

    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl) {
        vl = __riscv_vsetvl_e32m2(n);
        vfloat32m2_t v = __riscv_vle32_v_f32m2(aVector, vl);
        // Take |v| so the (arithmetic) right shift of the bits yields the
        // exponent field without sign-bit interference.
        vfloat32m2_t a = __riscv_vfabs(v, vl);
        vfloat32m2_t exp = __riscv_vfcvt_f(
            __riscv_vsub(__riscv_vsra(__riscv_vreinterpret_i32m2(a), 23, vl), c127, vl),
            vl);
        // frac = significand bits of v OR'ed with the bit pattern of 1.0f,
        // giving a float in [1, 2).
        vfloat32m2_t frac = __riscv_vreinterpret_f32m2(
            __riscv_vor(__riscv_vand(__riscv_vreinterpret_i32m2(v), m2, vl), m1, vl));

        // Horner evaluation of the minimax polynomial at frac.
        vfloat32m2_t mant = c0;
        mant = __riscv_vfmadd(mant, frac, c1, vl);
        mant = __riscv_vfmadd(mant, frac, c2, vl);
#if LOG_POLY_DEGREE >= 4
        mant = __riscv_vfmadd(mant, frac, c3, vl);
#if LOG_POLY_DEGREE >= 5
        mant = __riscv_vfmadd(mant, frac, c4, vl);
#if LOG_POLY_DEGREE >= 6
        mant = __riscv_vfmadd(mant, frac, c5, vl);
#endif
#endif
#endif
        // log2(frac) ~= poly(frac) * (frac - 1), accumulated onto the exponent.
        exp = __riscv_vfmacc(exp, mant, __riscv_vfsub(frac, cf1, vl), vl);

        __riscv_vse32(bVector, exp, vl);
    }
}
789 #endif /*LV_HAVE_RVV*/
790 
791 #endif /* INCLUDED_volk_32f_log2_32f_u_H */
static void volk_32f_log2_32f_neon(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_log2_32f.h:416
static void volk_32f_log2_32f_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_log2_32f.h:92
#define VLOG2Q_NEON_F32(log2_approx, aval)
Definition: volk_32f_log2_32f.h:382
#define VLOG2Q_NEON_PREAMBLE()
Definition: volk_32f_log2_32f.h:367
static float log2f_non_ieee(float f)
Definition: volk_common.h:155