/*
 * VOLK (Vector Optimized Library of Kernels) 3.2.0
 * Architecture-tuned implementations of math kernels.
 * File: volk_32f_x2_pow_32f.h
 */
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
60 #ifndef INCLUDED_volk_32f_x2_pow_32f_a_H
61 #define INCLUDED_volk_32f_x2_pow_32f_a_H
62 
63 #include <inttypes.h>
64 #include <math.h>
65 #include <stdio.h>
66 #include <stdlib.h>
67 
68 #define POW_POLY_DEGREE 3
69 
70 #if LV_HAVE_AVX2 && LV_HAVE_FMA
71 #include <immintrin.h>
72 
73 #define POLY0_AVX2_FMA(x, c0) _mm256_set1_ps(c0)
74 #define POLY1_AVX2_FMA(x, c0, c1) \
75  _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0))
76 #define POLY2_AVX2_FMA(x, c0, c1, c2) \
77  _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0))
78 #define POLY3_AVX2_FMA(x, c0, c1, c2, c3) \
79  _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0))
80 #define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) \
81  _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
82 #define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) \
83  _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))
84 
85 static inline void volk_32f_x2_pow_32f_a_avx2_fma(float* cVector,
86  const float* bVector,
87  const float* aVector,
88  unsigned int num_points)
89 {
90  float* cPtr = cVector;
91  const float* bPtr = bVector;
92  const float* aPtr = aVector;
93 
94  unsigned int number = 0;
95  const unsigned int eighthPoints = num_points / 8;
96 
97  __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
98  __m256 tmp, fx, mask, pow2n, z, y;
99  __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
100  __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
101  __m256i bias, exp, emm0, pi32_0x7f;
102 
103  one = _mm256_set1_ps(1.0);
104  exp_hi = _mm256_set1_ps(88.3762626647949);
105  exp_lo = _mm256_set1_ps(-88.3762626647949);
106  ln2 = _mm256_set1_ps(0.6931471805);
107  log2EF = _mm256_set1_ps(1.44269504088896341);
108  half = _mm256_set1_ps(0.5);
109  exp_C1 = _mm256_set1_ps(0.693359375);
110  exp_C2 = _mm256_set1_ps(-2.12194440e-4);
111  pi32_0x7f = _mm256_set1_epi32(0x7f);
112 
113  exp_p0 = _mm256_set1_ps(1.9875691500e-4);
114  exp_p1 = _mm256_set1_ps(1.3981999507e-3);
115  exp_p2 = _mm256_set1_ps(8.3334519073e-3);
116  exp_p3 = _mm256_set1_ps(4.1665795894e-2);
117  exp_p4 = _mm256_set1_ps(1.6666665459e-1);
118  exp_p5 = _mm256_set1_ps(5.0000001201e-1);
119 
120  for (; number < eighthPoints; number++) {
121  // First compute the logarithm
122  aVal = _mm256_load_ps(aPtr);
123  bias = _mm256_set1_epi32(127);
124  leadingOne = _mm256_set1_ps(1.0f);
125  exp = _mm256_sub_epi32(
126  _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
127  _mm256_set1_epi32(0x7f800000)),
128  23),
129  bias);
130  logarithm = _mm256_cvtepi32_ps(exp);
131 
132  frac = _mm256_or_ps(
133  leadingOne,
134  _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
135 
136 #if POW_POLY_DEGREE == 6
137  mantissa = POLY5_AVX2_FMA(frac,
138  3.1157899f,
139  -3.3241990f,
140  2.5988452f,
141  -1.2315303f,
142  3.1821337e-1f,
143  -3.4436006e-2f);
144 #elif POW_POLY_DEGREE == 5
145  mantissa = POLY4_AVX2_FMA(frac,
146  2.8882704548164776201f,
147  -2.52074962577807006663f,
148  1.48116647521213171641f,
149  -0.465725644288844778798f,
150  0.0596515482674574969533f);
151 #elif POW_POLY_DEGREE == 4
152  mantissa = POLY3_AVX2_FMA(frac,
153  2.61761038894603480148f,
154  -1.75647175389045657003f,
155  0.688243882994381274313f,
156  -0.107254423828329604454f);
157 #elif POW_POLY_DEGREE == 3
158  mantissa = POLY2_AVX2_FMA(frac,
159  2.28330284476918490682f,
160  -1.04913055217340124191f,
161  0.204446009836232697516f);
162 #else
163 #error
164 #endif
165 
166  logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm);
167  logarithm = _mm256_mul_ps(logarithm, ln2);
168 
169  // Now calculate b*lna
170  bVal = _mm256_load_ps(bPtr);
171  bVal = _mm256_mul_ps(bVal, logarithm);
172 
173  // Now compute exp(b*lna)
174  bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);
175 
176  fx = _mm256_fmadd_ps(bVal, log2EF, half);
177 
178  emm0 = _mm256_cvttps_epi32(fx);
179  tmp = _mm256_cvtepi32_ps(emm0);
180 
181  mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
182  fx = _mm256_sub_ps(tmp, mask);
183 
184  tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal);
185  bVal = _mm256_fnmadd_ps(fx, exp_C2, tmp);
186  z = _mm256_mul_ps(bVal, bVal);
187 
188  y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1);
189  y = _mm256_fmadd_ps(y, bVal, exp_p2);
190  y = _mm256_fmadd_ps(y, bVal, exp_p3);
191  y = _mm256_fmadd_ps(y, bVal, exp_p4);
192  y = _mm256_fmadd_ps(y, bVal, exp_p5);
193  y = _mm256_fmadd_ps(y, z, bVal);
194  y = _mm256_add_ps(y, one);
195 
196  emm0 =
197  _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);
198 
199  pow2n = _mm256_castsi256_ps(emm0);
200  cVal = _mm256_mul_ps(y, pow2n);
201 
202  _mm256_store_ps(cPtr, cVal);
203 
204  aPtr += 8;
205  bPtr += 8;
206  cPtr += 8;
207  }
208 
209  number = eighthPoints * 8;
210  for (; number < num_points; number++) {
211  *cPtr++ = pow(*aPtr++, *bPtr++);
212  }
213 }
214 
215 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
216 
217 #ifdef LV_HAVE_AVX2
218 #include <immintrin.h>
219 
220 #define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
221 #define POLY1_AVX2(x, c0, c1) \
222  _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
223 #define POLY2_AVX2(x, c0, c1, c2) \
224  _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
225 #define POLY3_AVX2(x, c0, c1, c2, c3) \
226  _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
227 #define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
228  _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
229 #define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
230  _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
231 
232 static inline void volk_32f_x2_pow_32f_a_avx2(float* cVector,
233  const float* bVector,
234  const float* aVector,
235  unsigned int num_points)
236 {
237  float* cPtr = cVector;
238  const float* bPtr = bVector;
239  const float* aPtr = aVector;
240 
241  unsigned int number = 0;
242  const unsigned int eighthPoints = num_points / 8;
243 
244  __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
245  __m256 tmp, fx, mask, pow2n, z, y;
246  __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
247  __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
248  __m256i bias, exp, emm0, pi32_0x7f;
249 
250  one = _mm256_set1_ps(1.0);
251  exp_hi = _mm256_set1_ps(88.3762626647949);
252  exp_lo = _mm256_set1_ps(-88.3762626647949);
253  ln2 = _mm256_set1_ps(0.6931471805);
254  log2EF = _mm256_set1_ps(1.44269504088896341);
255  half = _mm256_set1_ps(0.5);
256  exp_C1 = _mm256_set1_ps(0.693359375);
257  exp_C2 = _mm256_set1_ps(-2.12194440e-4);
258  pi32_0x7f = _mm256_set1_epi32(0x7f);
259 
260  exp_p0 = _mm256_set1_ps(1.9875691500e-4);
261  exp_p1 = _mm256_set1_ps(1.3981999507e-3);
262  exp_p2 = _mm256_set1_ps(8.3334519073e-3);
263  exp_p3 = _mm256_set1_ps(4.1665795894e-2);
264  exp_p4 = _mm256_set1_ps(1.6666665459e-1);
265  exp_p5 = _mm256_set1_ps(5.0000001201e-1);
266 
267  for (; number < eighthPoints; number++) {
268  // First compute the logarithm
269  aVal = _mm256_load_ps(aPtr);
270  bias = _mm256_set1_epi32(127);
271  leadingOne = _mm256_set1_ps(1.0f);
272  exp = _mm256_sub_epi32(
273  _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
274  _mm256_set1_epi32(0x7f800000)),
275  23),
276  bias);
277  logarithm = _mm256_cvtepi32_ps(exp);
278 
279  frac = _mm256_or_ps(
280  leadingOne,
281  _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
282 
283 #if POW_POLY_DEGREE == 6
284  mantissa = POLY5_AVX2(frac,
285  3.1157899f,
286  -3.3241990f,
287  2.5988452f,
288  -1.2315303f,
289  3.1821337e-1f,
290  -3.4436006e-2f);
291 #elif POW_POLY_DEGREE == 5
292  mantissa = POLY4_AVX2(frac,
293  2.8882704548164776201f,
294  -2.52074962577807006663f,
295  1.48116647521213171641f,
296  -0.465725644288844778798f,
297  0.0596515482674574969533f);
298 #elif POW_POLY_DEGREE == 4
299  mantissa = POLY3_AVX2(frac,
300  2.61761038894603480148f,
301  -1.75647175389045657003f,
302  0.688243882994381274313f,
303  -0.107254423828329604454f);
304 #elif POW_POLY_DEGREE == 3
305  mantissa = POLY2_AVX2(frac,
306  2.28330284476918490682f,
307  -1.04913055217340124191f,
308  0.204446009836232697516f);
309 #else
310 #error
311 #endif
312 
313  logarithm = _mm256_add_ps(
314  _mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), logarithm);
315  logarithm = _mm256_mul_ps(logarithm, ln2);
316 
317  // Now calculate b*lna
318  bVal = _mm256_load_ps(bPtr);
319  bVal = _mm256_mul_ps(bVal, logarithm);
320 
321  // Now compute exp(b*lna)
322  bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);
323 
324  fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half);
325 
326  emm0 = _mm256_cvttps_epi32(fx);
327  tmp = _mm256_cvtepi32_ps(emm0);
328 
329  mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
330  fx = _mm256_sub_ps(tmp, mask);
331 
332  tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1));
333  bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2));
334  z = _mm256_mul_ps(bVal, bVal);
335 
336  y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1);
337  y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2);
338  y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3);
339  y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4);
340  y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5);
341  y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal);
342  y = _mm256_add_ps(y, one);
343 
344  emm0 =
345  _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);
346 
347  pow2n = _mm256_castsi256_ps(emm0);
348  cVal = _mm256_mul_ps(y, pow2n);
349 
350  _mm256_store_ps(cPtr, cVal);
351 
352  aPtr += 8;
353  bPtr += 8;
354  cPtr += 8;
355  }
356 
357  number = eighthPoints * 8;
358  for (; number < num_points; number++) {
359  *cPtr++ = pow(*aPtr++, *bPtr++);
360  }
361 }
362 
363 #endif /* LV_HAVE_AVX2 for aligned */
364 
365 
366 #ifdef LV_HAVE_SSE4_1
367 #include <smmintrin.h>
368 
369 #define POLY0(x, c0) _mm_set1_ps(c0)
370 #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
371 #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
372 #define POLY3(x, c0, c1, c2, c3) \
373  _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
374 #define POLY4(x, c0, c1, c2, c3, c4) \
375  _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
376 #define POLY5(x, c0, c1, c2, c3, c4, c5) \
377  _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
378 
/*
 * Computes c[i] = a[i] ^ b[i] for 32-bit floats as exp(b * ln(a)).
 * Aligned SSE4.1 kernel: all three buffers must be 16-byte aligned.
 *
 * The log stage extracts the IEEE-754 exponent field and approximates
 * log2 of the mantissa with a degree-POW_POLY_DEGREE polynomial; the
 * exp stage uses Cephes-style range reduction exp(x) = 2^n * exp(r).
 * NOTE(review): the bit-level log assumes a normal, positive base —
 * a <= 0 or non-finite inputs give meaningless results; presumably
 * callers guarantee a > 0 (confirm upstream).
 */
static inline void volk_32f_x2_pow_32f_a_sse4_1(float* cVector,
                                                const float* bVector,
                                                const float* aVector,
                                                unsigned int num_points)
{
    float* cPtr = cVector;
    const float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
    __m128 tmp, fx, mask, pow2n, z, y;
    __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
    __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
    __m128i bias, exp, emm0, pi32_0x7f;

    one = _mm_set1_ps(1.0);
    exp_hi = _mm_set1_ps(88.3762626647949);  /* exp() overflow clamp for f32 */
    exp_lo = _mm_set1_ps(-88.3762626647949); /* exp() underflow clamp */
    ln2 = _mm_set1_ps(0.6931471805);
    log2EF = _mm_set1_ps(1.44269504088896341); /* log2(e) */
    half = _mm_set1_ps(0.5);
    exp_C1 = _mm_set1_ps(0.693359375);    /* hi part of ln(2), Cody-Waite split */
    exp_C2 = _mm_set1_ps(-2.12194440e-4); /* lo part of ln(2) */
    pi32_0x7f = _mm_set1_epi32(0x7f);

    /* minimax coefficients for exp(r) on the reduced range */
    exp_p0 = _mm_set1_ps(1.9875691500e-4);
    exp_p1 = _mm_set1_ps(1.3981999507e-3);
    exp_p2 = _mm_set1_ps(8.3334519073e-3);
    exp_p3 = _mm_set1_ps(4.1665795894e-2);
    exp_p4 = _mm_set1_ps(1.6666665459e-1);
    exp_p5 = _mm_set1_ps(5.0000001201e-1);

    for (; number < quarterPoints; number++) {
        // First compute the logarithm
        aVal = _mm_load_ps(aPtr);
        bias = _mm_set1_epi32(127);
        leadingOne = _mm_set1_ps(1.0f);
        /* unbiased exponent = (bits >> 23) - 127 */
        exp = _mm_sub_epi32(
            _mm_srli_epi32(
                _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23),
            bias);
        logarithm = _mm_cvtepi32_ps(exp);

        /* mantissa in [1, 2): OR the implicit leading one into the fraction bits */
        frac = _mm_or_ps(leadingOne,
                         _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));

#if POW_POLY_DEGREE == 6
        mantissa = POLY5(frac,
                         3.1157899f,
                         -3.3241990f,
                         2.5988452f,
                         -1.2315303f,
                         3.1821337e-1f,
                         -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
        mantissa = POLY4(frac,
                         2.8882704548164776201f,
                         -2.52074962577807006663f,
                         1.48116647521213171641f,
                         -0.465725644288844778798f,
                         0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
        mantissa = POLY3(frac,
                         2.61761038894603480148f,
                         -1.75647175389045657003f,
                         0.688243882994381274313f,
                         -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
        mantissa = POLY2(frac,
                         2.28330284476918490682f,
                         -1.04913055217340124191f,
                         0.204446009836232697516f);
#else
#error
#endif

        /* log2(a) = exponent + poly(frac) * (frac - 1); then scale by ln(2) */
        logarithm =
            _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
        logarithm = _mm_mul_ps(logarithm, ln2);


        // Now calculate b*lna
        bVal = _mm_load_ps(bPtr);
        bVal = _mm_mul_ps(bVal, logarithm);

        // Now compute exp(b*lna)
        bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo);

        /* n = round(x * log2(e)) via +0.5 then floor-correction */
        fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half);

        emm0 = _mm_cvttps_epi32(fx);
        tmp = _mm_cvtepi32_ps(emm0);

        /* if truncation rounded up, subtract 1 (emulates floor) */
        mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
        fx = _mm_sub_ps(tmp, mask);

        /* r = x - n*ln(2), with ln(2) split in two for extra precision */
        tmp = _mm_mul_ps(fx, exp_C1);
        z = _mm_mul_ps(fx, exp_C2);
        bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z);
        z = _mm_mul_ps(bVal, bVal);

        /* exp(r) polynomial — same coefficients as the AVX kernels but a
           different (equivalent) factorization of the Horner chain */
        y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal);
        y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3);
        y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal);
        y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal);
        y = _mm_add_ps(y, one);

        /* build 2^n by placing (n + 127) into the exponent field */
        emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);

        pow2n = _mm_castsi128_ps(emm0);
        cVal = _mm_mul_ps(y, pow2n);

        _mm_store_ps(cPtr, cVal);

        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    /* scalar tail for the remaining < 4 elements */
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *cPtr++ = powf(*aPtr++, *bPtr++);
    }
}
506 
507 #endif /* LV_HAVE_SSE4_1 for aligned */
508 
509 #endif /* INCLUDED_volk_32f_x2_pow_32f_a_H */
510 
511 #ifndef INCLUDED_volk_32f_x2_pow_32f_u_H
512 #define INCLUDED_volk_32f_x2_pow_32f_u_H
513 
514 #include <inttypes.h>
515 #include <math.h>
516 #include <stdio.h>
517 #include <stdlib.h>
518 
519 #define POW_POLY_DEGREE 3
520 
521 #ifdef LV_HAVE_GENERIC
522 
523 static inline void volk_32f_x2_pow_32f_generic(float* cVector,
524  const float* bVector,
525  const float* aVector,
526  unsigned int num_points)
527 {
528  float* cPtr = cVector;
529  const float* bPtr = bVector;
530  const float* aPtr = aVector;
531  unsigned int number = 0;
532 
533  for (number = 0; number < num_points; number++) {
534  *cPtr++ = powf(*aPtr++, *bPtr++);
535  }
536 }
537 #endif /* LV_HAVE_GENERIC */
538 
539 
540 #ifdef LV_HAVE_SSE4_1
541 #include <smmintrin.h>
542 
543 #define POLY0(x, c0) _mm_set1_ps(c0)
544 #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
545 #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
546 #define POLY3(x, c0, c1, c2, c3) \
547  _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
548 #define POLY4(x, c0, c1, c2, c3, c4) \
549  _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
550 #define POLY5(x, c0, c1, c2, c3, c4, c5) \
551  _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
552 
/*
 * Computes c[i] = a[i] ^ b[i] for 32-bit floats as exp(b * ln(a)).
 * Unaligned SSE4.1 kernel: identical math to the aligned variant, but
 * uses loadu/storeu so no buffer alignment is required.
 *
 * Log stage: IEEE-754 exponent extraction plus a degree-POW_POLY_DEGREE
 * mantissa polynomial. Exp stage: Cephes-style reduction
 * exp(x) = 2^n * exp(r).
 * NOTE(review): assumes a normal, positive base; a <= 0 or non-finite
 * inputs give meaningless results — confirm callers guarantee a > 0.
 */
static inline void volk_32f_x2_pow_32f_u_sse4_1(float* cVector,
                                                const float* bVector,
                                                const float* aVector,
                                                unsigned int num_points)
{
    float* cPtr = cVector;
    const float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
    __m128 tmp, fx, mask, pow2n, z, y;
    __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
    __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
    __m128i bias, exp, emm0, pi32_0x7f;

    one = _mm_set1_ps(1.0);
    exp_hi = _mm_set1_ps(88.3762626647949);  /* exp() overflow clamp for f32 */
    exp_lo = _mm_set1_ps(-88.3762626647949); /* exp() underflow clamp */
    ln2 = _mm_set1_ps(0.6931471805);
    log2EF = _mm_set1_ps(1.44269504088896341); /* log2(e) */
    half = _mm_set1_ps(0.5);
    exp_C1 = _mm_set1_ps(0.693359375);    /* hi part of ln(2), Cody-Waite split */
    exp_C2 = _mm_set1_ps(-2.12194440e-4); /* lo part of ln(2) */
    pi32_0x7f = _mm_set1_epi32(0x7f);

    /* minimax coefficients for exp(r) on the reduced range */
    exp_p0 = _mm_set1_ps(1.9875691500e-4);
    exp_p1 = _mm_set1_ps(1.3981999507e-3);
    exp_p2 = _mm_set1_ps(8.3334519073e-3);
    exp_p3 = _mm_set1_ps(4.1665795894e-2);
    exp_p4 = _mm_set1_ps(1.6666665459e-1);
    exp_p5 = _mm_set1_ps(5.0000001201e-1);

    for (; number < quarterPoints; number++) {
        // First compute the logarithm
        aVal = _mm_loadu_ps(aPtr);
        bias = _mm_set1_epi32(127);
        leadingOne = _mm_set1_ps(1.0f);
        /* unbiased exponent = (bits >> 23) - 127 */
        exp = _mm_sub_epi32(
            _mm_srli_epi32(
                _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23),
            bias);
        logarithm = _mm_cvtepi32_ps(exp);

        /* mantissa in [1, 2): OR the implicit leading one into the fraction bits */
        frac = _mm_or_ps(leadingOne,
                         _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));

#if POW_POLY_DEGREE == 6
        mantissa = POLY5(frac,
                         3.1157899f,
                         -3.3241990f,
                         2.5988452f,
                         -1.2315303f,
                         3.1821337e-1f,
                         -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
        mantissa = POLY4(frac,
                         2.8882704548164776201f,
                         -2.52074962577807006663f,
                         1.48116647521213171641f,
                         -0.465725644288844778798f,
                         0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
        mantissa = POLY3(frac,
                         2.61761038894603480148f,
                         -1.75647175389045657003f,
                         0.688243882994381274313f,
                         -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
        mantissa = POLY2(frac,
                         2.28330284476918490682f,
                         -1.04913055217340124191f,
                         0.204446009836232697516f);
#else
#error
#endif

        /* log2(a) = exponent + poly(frac) * (frac - 1); then scale by ln(2) */
        logarithm =
            _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
        logarithm = _mm_mul_ps(logarithm, ln2);


        // Now calculate b*lna
        bVal = _mm_loadu_ps(bPtr);
        bVal = _mm_mul_ps(bVal, logarithm);

        // Now compute exp(b*lna)
        bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo);

        /* n = round(x * log2(e)) via +0.5 then floor-correction */
        fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half);

        emm0 = _mm_cvttps_epi32(fx);
        tmp = _mm_cvtepi32_ps(emm0);

        /* if truncation rounded up, subtract 1 (emulates floor) */
        mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
        fx = _mm_sub_ps(tmp, mask);

        /* r = x - n*ln(2), with ln(2) split in two for extra precision */
        tmp = _mm_mul_ps(fx, exp_C1);
        z = _mm_mul_ps(fx, exp_C2);
        bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z);
        z = _mm_mul_ps(bVal, bVal);

        /* exp(r) polynomial — same coefficients as the AVX kernels but a
           different (equivalent) factorization of the Horner chain */
        y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal);
        y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3);
        y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal);
        y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal);
        y = _mm_add_ps(y, one);

        /* build 2^n by placing (n + 127) into the exponent field */
        emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);

        pow2n = _mm_castsi128_ps(emm0);
        cVal = _mm_mul_ps(y, pow2n);

        _mm_storeu_ps(cPtr, cVal);

        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    /* scalar tail for the remaining < 4 elements */
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *cPtr++ = powf(*aPtr++, *bPtr++);
    }
}
680 
681 #endif /* LV_HAVE_SSE4_1 for unaligned */
682 
683 #if LV_HAVE_AVX2 && LV_HAVE_FMA
684 #include <immintrin.h>
685 
686 #define POLY0_AVX2_FMA(x, c0) _mm256_set1_ps(c0)
687 #define POLY1_AVX2_FMA(x, c0, c1) \
688  _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0))
689 #define POLY2_AVX2_FMA(x, c0, c1, c2) \
690  _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0))
691 #define POLY3_AVX2_FMA(x, c0, c1, c2, c3) \
692  _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0))
693 #define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) \
694  _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
695 #define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) \
696  _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))
697 
698 static inline void volk_32f_x2_pow_32f_u_avx2_fma(float* cVector,
699  const float* bVector,
700  const float* aVector,
701  unsigned int num_points)
702 {
703  float* cPtr = cVector;
704  const float* bPtr = bVector;
705  const float* aPtr = aVector;
706 
707  unsigned int number = 0;
708  const unsigned int eighthPoints = num_points / 8;
709 
710  __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
711  __m256 tmp, fx, mask, pow2n, z, y;
712  __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
713  __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
714  __m256i bias, exp, emm0, pi32_0x7f;
715 
716  one = _mm256_set1_ps(1.0);
717  exp_hi = _mm256_set1_ps(88.3762626647949);
718  exp_lo = _mm256_set1_ps(-88.3762626647949);
719  ln2 = _mm256_set1_ps(0.6931471805);
720  log2EF = _mm256_set1_ps(1.44269504088896341);
721  half = _mm256_set1_ps(0.5);
722  exp_C1 = _mm256_set1_ps(0.693359375);
723  exp_C2 = _mm256_set1_ps(-2.12194440e-4);
724  pi32_0x7f = _mm256_set1_epi32(0x7f);
725 
726  exp_p0 = _mm256_set1_ps(1.9875691500e-4);
727  exp_p1 = _mm256_set1_ps(1.3981999507e-3);
728  exp_p2 = _mm256_set1_ps(8.3334519073e-3);
729  exp_p3 = _mm256_set1_ps(4.1665795894e-2);
730  exp_p4 = _mm256_set1_ps(1.6666665459e-1);
731  exp_p5 = _mm256_set1_ps(5.0000001201e-1);
732 
733  for (; number < eighthPoints; number++) {
734  // First compute the logarithm
735  aVal = _mm256_loadu_ps(aPtr);
736  bias = _mm256_set1_epi32(127);
737  leadingOne = _mm256_set1_ps(1.0f);
738  exp = _mm256_sub_epi32(
739  _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
740  _mm256_set1_epi32(0x7f800000)),
741  23),
742  bias);
743  logarithm = _mm256_cvtepi32_ps(exp);
744 
745  frac = _mm256_or_ps(
746  leadingOne,
747  _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
748 
749 #if POW_POLY_DEGREE == 6
750  mantissa = POLY5_AVX2_FMA(frac,
751  3.1157899f,
752  -3.3241990f,
753  2.5988452f,
754  -1.2315303f,
755  3.1821337e-1f,
756  -3.4436006e-2f);
757 #elif POW_POLY_DEGREE == 5
758  mantissa = POLY4_AVX2_FMA(frac,
759  2.8882704548164776201f,
760  -2.52074962577807006663f,
761  1.48116647521213171641f,
762  -0.465725644288844778798f,
763  0.0596515482674574969533f);
764 #elif POW_POLY_DEGREE == 4
765  mantissa = POLY3_AVX2_FMA(frac,
766  2.61761038894603480148f,
767  -1.75647175389045657003f,
768  0.688243882994381274313f,
769  -0.107254423828329604454f);
770 #elif POW_POLY_DEGREE == 3
771  mantissa = POLY2_AVX2_FMA(frac,
772  2.28330284476918490682f,
773  -1.04913055217340124191f,
774  0.204446009836232697516f);
775 #else
776 #error
777 #endif
778 
779  logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm);
780  logarithm = _mm256_mul_ps(logarithm, ln2);
781 
782 
783  // Now calculate b*lna
784  bVal = _mm256_loadu_ps(bPtr);
785  bVal = _mm256_mul_ps(bVal, logarithm);
786 
787  // Now compute exp(b*lna)
788  bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);
789 
790  fx = _mm256_fmadd_ps(bVal, log2EF, half);
791 
792  emm0 = _mm256_cvttps_epi32(fx);
793  tmp = _mm256_cvtepi32_ps(emm0);
794 
795  mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
796  fx = _mm256_sub_ps(tmp, mask);
797 
798  tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal);
799  bVal = _mm256_fnmadd_ps(fx, exp_C2, tmp);
800  z = _mm256_mul_ps(bVal, bVal);
801 
802  y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1);
803  y = _mm256_fmadd_ps(y, bVal, exp_p2);
804  y = _mm256_fmadd_ps(y, bVal, exp_p3);
805  y = _mm256_fmadd_ps(y, bVal, exp_p4);
806  y = _mm256_fmadd_ps(y, bVal, exp_p5);
807  y = _mm256_fmadd_ps(y, z, bVal);
808  y = _mm256_add_ps(y, one);
809 
810  emm0 =
811  _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);
812 
813  pow2n = _mm256_castsi256_ps(emm0);
814  cVal = _mm256_mul_ps(y, pow2n);
815 
816  _mm256_storeu_ps(cPtr, cVal);
817 
818  aPtr += 8;
819  bPtr += 8;
820  cPtr += 8;
821  }
822 
823  number = eighthPoints * 8;
824  for (; number < num_points; number++) {
825  *cPtr++ = pow(*aPtr++, *bPtr++);
826  }
827 }
828 
829 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
830 
831 #ifdef LV_HAVE_AVX2
832 #include <immintrin.h>
833 
834 #define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
835 #define POLY1_AVX2(x, c0, c1) \
836  _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
837 #define POLY2_AVX2(x, c0, c1, c2) \
838  _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
839 #define POLY3_AVX2(x, c0, c1, c2, c3) \
840  _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
841 #define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
842  _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
843 #define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
844  _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
845 
846 static inline void volk_32f_x2_pow_32f_u_avx2(float* cVector,
847  const float* bVector,
848  const float* aVector,
849  unsigned int num_points)
850 {
851  float* cPtr = cVector;
852  const float* bPtr = bVector;
853  const float* aPtr = aVector;
854 
855  unsigned int number = 0;
856  const unsigned int eighthPoints = num_points / 8;
857 
858  __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
859  __m256 tmp, fx, mask, pow2n, z, y;
860  __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
861  __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
862  __m256i bias, exp, emm0, pi32_0x7f;
863 
864  one = _mm256_set1_ps(1.0);
865  exp_hi = _mm256_set1_ps(88.3762626647949);
866  exp_lo = _mm256_set1_ps(-88.3762626647949);
867  ln2 = _mm256_set1_ps(0.6931471805);
868  log2EF = _mm256_set1_ps(1.44269504088896341);
869  half = _mm256_set1_ps(0.5);
870  exp_C1 = _mm256_set1_ps(0.693359375);
871  exp_C2 = _mm256_set1_ps(-2.12194440e-4);
872  pi32_0x7f = _mm256_set1_epi32(0x7f);
873 
874  exp_p0 = _mm256_set1_ps(1.9875691500e-4);
875  exp_p1 = _mm256_set1_ps(1.3981999507e-3);
876  exp_p2 = _mm256_set1_ps(8.3334519073e-3);
877  exp_p3 = _mm256_set1_ps(4.1665795894e-2);
878  exp_p4 = _mm256_set1_ps(1.6666665459e-1);
879  exp_p5 = _mm256_set1_ps(5.0000001201e-1);
880 
881  for (; number < eighthPoints; number++) {
882  // First compute the logarithm
883  aVal = _mm256_loadu_ps(aPtr);
884  bias = _mm256_set1_epi32(127);
885  leadingOne = _mm256_set1_ps(1.0f);
886  exp = _mm256_sub_epi32(
887  _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
888  _mm256_set1_epi32(0x7f800000)),
889  23),
890  bias);
891  logarithm = _mm256_cvtepi32_ps(exp);
892 
893  frac = _mm256_or_ps(
894  leadingOne,
895  _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
896 
897 #if POW_POLY_DEGREE == 6
898  mantissa = POLY5_AVX2(frac,
899  3.1157899f,
900  -3.3241990f,
901  2.5988452f,
902  -1.2315303f,
903  3.1821337e-1f,
904  -3.4436006e-2f);
905 #elif POW_POLY_DEGREE == 5
906  mantissa = POLY4_AVX2(frac,
907  2.8882704548164776201f,
908  -2.52074962577807006663f,
909  1.48116647521213171641f,
910  -0.465725644288844778798f,
911  0.0596515482674574969533f);
912 #elif POW_POLY_DEGREE == 4
913  mantissa = POLY3_AVX2(frac,
914  2.61761038894603480148f,
915  -1.75647175389045657003f,
916  0.688243882994381274313f,
917  -0.107254423828329604454f);
918 #elif POW_POLY_DEGREE == 3
919  mantissa = POLY2_AVX2(frac,
920  2.28330284476918490682f,
921  -1.04913055217340124191f,
922  0.204446009836232697516f);
923 #else
924 #error
925 #endif
926 
927  logarithm = _mm256_add_ps(
928  _mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), logarithm);
929  logarithm = _mm256_mul_ps(logarithm, ln2);
930 
931  // Now calculate b*lna
932  bVal = _mm256_loadu_ps(bPtr);
933  bVal = _mm256_mul_ps(bVal, logarithm);
934 
935  // Now compute exp(b*lna)
936  bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);
937 
938  fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half);
939 
940  emm0 = _mm256_cvttps_epi32(fx);
941  tmp = _mm256_cvtepi32_ps(emm0);
942 
943  mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
944  fx = _mm256_sub_ps(tmp, mask);
945 
946  tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1));
947  bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2));
948  z = _mm256_mul_ps(bVal, bVal);
949 
950  y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1);
951  y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2);
952  y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3);
953  y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4);
954  y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5);
955  y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal);
956  y = _mm256_add_ps(y, one);
957 
958  emm0 =
959  _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);
960 
961  pow2n = _mm256_castsi256_ps(emm0);
962  cVal = _mm256_mul_ps(y, pow2n);
963 
964  _mm256_storeu_ps(cPtr, cVal);
965 
966  aPtr += 8;
967  bPtr += 8;
968  cPtr += 8;
969  }
970 
971  number = eighthPoints * 8;
972  for (; number < num_points; number++) {
973  *cPtr++ = pow(*aPtr++, *bPtr++);
974  }
975 }
976 
977 #endif /* LV_HAVE_AVX2 for unaligned */
978 
979 #ifdef LV_HAVE_RVV
980 #include <riscv_vector.h>
981 
/*
 * RISC-V Vector (RVV) kernel: c[i] = a[i] ^ b[i] as exp(b * ln(a)).
 * Stripmined with vsetvl, so any num_points (including the tail) is
 * handled by the vector loop — no scalar cleanup needed.
 *
 * Same algorithm family as the x86 kernels: bit-level log2 from the
 * IEEE-754 exponent plus a mantissa polynomial, then Cephes-style
 * exp range reduction. Note exp_C1/exp_C2 carry the opposite signs of
 * the x86 constants because they are applied with vfmacc (an add),
 * not a subtract.
 * NOTE(review): exponent is taken from |a| (vfabs) but the mantissa
 * from a itself; like the x86 kernels, results for a <= 0 or
 * non-finite inputs are not meaningful — confirm callers pass a > 0.
 */
static inline void volk_32f_x2_pow_32f_rvv(float* cVector,
                                           const float* bVector,
                                           const float* aVector,
                                           unsigned int num_points)
{
    size_t vlmax = __riscv_vsetvlmax_e32m1();

    /* log2(mantissa) polynomial coefficients, lowest degree first */
#if POW_POLY_DEGREE == 6
    const vfloat32m1_t cl5 = __riscv_vfmv_v_f_f32m1(3.1157899f, vlmax);
    const vfloat32m1_t cl4 = __riscv_vfmv_v_f_f32m1(-3.3241990f, vlmax);
    const vfloat32m1_t cl3 = __riscv_vfmv_v_f_f32m1(2.5988452f, vlmax);
    const vfloat32m1_t cl2 = __riscv_vfmv_v_f_f32m1(-1.2315303f, vlmax);
    const vfloat32m1_t cl1 = __riscv_vfmv_v_f_f32m1(3.1821337e-1f, vlmax);
    const vfloat32m1_t cl0 = __riscv_vfmv_v_f_f32m1(-3.4436006e-2f, vlmax);
#elif POW_POLY_DEGREE == 5
    const vfloat32m1_t cl4 = __riscv_vfmv_v_f_f32m1(2.8882704548164776201f, vlmax);
    const vfloat32m1_t cl3 = __riscv_vfmv_v_f_f32m1(-2.52074962577807006663f, vlmax);
    const vfloat32m1_t cl2 = __riscv_vfmv_v_f_f32m1(1.48116647521213171641f, vlmax);
    const vfloat32m1_t cl1 = __riscv_vfmv_v_f_f32m1(-0.465725644288844778798f, vlmax);
    const vfloat32m1_t cl0 = __riscv_vfmv_v_f_f32m1(0.0596515482674574969533f, vlmax);
#elif POW_POLY_DEGREE == 4
    const vfloat32m1_t cl3 = __riscv_vfmv_v_f_f32m1(2.61761038894603480148f, vlmax);
    const vfloat32m1_t cl2 = __riscv_vfmv_v_f_f32m1(-1.75647175389045657003f, vlmax);
    const vfloat32m1_t cl1 = __riscv_vfmv_v_f_f32m1(0.688243882994381274313f, vlmax);
    const vfloat32m1_t cl0 = __riscv_vfmv_v_f_f32m1(-0.107254423828329604454f, vlmax);
#elif POW_POLY_DEGREE == 3
    const vfloat32m1_t cl2 = __riscv_vfmv_v_f_f32m1(2.28330284476918490682f, vlmax);
    const vfloat32m1_t cl1 = __riscv_vfmv_v_f_f32m1(-1.04913055217340124191f, vlmax);
    const vfloat32m1_t cl0 = __riscv_vfmv_v_f_f32m1(0.204446009836232697516f, vlmax);
#else
#error
#endif

    const vfloat32m1_t exp_hi = __riscv_vfmv_v_f_f32m1(88.376259f, vlmax);  /* exp clamp */
    const vfloat32m1_t exp_lo = __riscv_vfmv_v_f_f32m1(-88.376259f, vlmax);
    const vfloat32m1_t log2EF = __riscv_vfmv_v_f_f32m1(1.442695f, vlmax);   /* log2(e) */
    const vfloat32m1_t exp_C1 = __riscv_vfmv_v_f_f32m1(-0.6933594f, vlmax); /* -hi(ln2) */
    const vfloat32m1_t exp_C2 = __riscv_vfmv_v_f_f32m1(0.000212194f, vlmax);/* -lo(ln2) */
    const vfloat32m1_t cf1 = __riscv_vfmv_v_f_f32m1(1.0f, vlmax);
    const vfloat32m1_t cf1o2 = __riscv_vfmv_v_f_f32m1(0.5f, vlmax);
    const vfloat32m1_t ln2 = __riscv_vfmv_v_f_f32m1(0.6931471805f, vlmax);

    /* minimax coefficients for exp(r) on the reduced range */
    const vfloat32m1_t ce0 = __riscv_vfmv_v_f_f32m1(1.9875691500e-4, vlmax);
    const vfloat32m1_t ce1 = __riscv_vfmv_v_f_f32m1(1.3981999507e-3, vlmax);
    const vfloat32m1_t ce2 = __riscv_vfmv_v_f_f32m1(8.3334519073e-3, vlmax);
    const vfloat32m1_t ce3 = __riscv_vfmv_v_f_f32m1(4.1665795894e-2, vlmax);
    const vfloat32m1_t ce4 = __riscv_vfmv_v_f_f32m1(1.6666665459e-1, vlmax);
    const vfloat32m1_t ce5 = __riscv_vfmv_v_f_f32m1(5.0000001201e-1, vlmax);

    const vint32m1_t m1 = __riscv_vreinterpret_i32m1(cf1);       /* bits of 1.0f */
    const vint32m1_t m2 = __riscv_vmv_v_x_i32m1(0x7FFFFF, vlmax); /* mantissa mask */
    const vint32m1_t c127 = __riscv_vmv_v_x_i32m1(127, vlmax);    /* exponent bias */

    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
        vl = __riscv_vsetvl_e32m1(n);
        vfloat32m1_t va = __riscv_vle32_v_f32m1(aVector, vl);
        vfloat32m1_t log;

        { /* log(a) */
            vfloat32m1_t a = __riscv_vfabs(va, vl);
            /* unbiased exponent = (bits >> 23) - 127, as float */
            vfloat32m1_t exp = __riscv_vfcvt_f(
                __riscv_vsub(
                    __riscv_vsra(__riscv_vreinterpret_i32m1(a), 23, vl), c127, vl),
                vl);
            /* mantissa in [1, 2): fraction bits ORed with the bits of 1.0f */
            vfloat32m1_t frac = __riscv_vreinterpret_f32m1(__riscv_vor(
                __riscv_vand(__riscv_vreinterpret_i32m1(va), m2, vl), m1, vl));

            /* Horner evaluation of the log2(mantissa) polynomial */
            vfloat32m1_t mant = cl0;
            mant = __riscv_vfmadd(mant, frac, cl1, vl);
            mant = __riscv_vfmadd(mant, frac, cl2, vl);
#if POW_POLY_DEGREE >= 4
            mant = __riscv_vfmadd(mant, frac, cl3, vl);
#if POW_POLY_DEGREE >= 5
            mant = __riscv_vfmadd(mant, frac, cl4, vl);
#if POW_POLY_DEGREE >= 6
            mant = __riscv_vfmadd(mant, frac, cl5, vl);
#endif
#endif
#endif
            /* log2(a) = exp + poly(frac) * (frac - 1); scale by ln(2) */
            log = __riscv_vfmacc(exp, mant, __riscv_vfsub(frac, cf1, vl), vl);
            log = __riscv_vfmul(log, ln2, vl);
        }

        vfloat32m1_t vb = __riscv_vle32_v_f32m1(bVector, vl);
        vb = __riscv_vfmul(vb, log, vl); /* b*log(a) */
        vfloat32m1_t exp;

        { /* exp(b*log(a)) */
            vb = __riscv_vfmin(vb, exp_hi, vl);
            vb = __riscv_vfmax(vb, exp_lo, vl);
            /* n = round(x * log2(e)) via +0.5 then floor-correction */
            vfloat32m1_t fx = __riscv_vfmadd(vb, log2EF, cf1o2, vl);

            vfloat32m1_t rtz = __riscv_vfcvt_f(__riscv_vfcvt_rtz_x(fx, vl), vl);
            /* masked subtract of 1 where truncation rounded up (floor) */
            fx = __riscv_vfsub_mu(__riscv_vmfgt(rtz, fx, vl), rtz, rtz, cf1, vl);
            /* r = x - n*ln(2): C1/C2 are pre-negated, so vfmacc adds them */
            vb = __riscv_vfmacc(vb, exp_C1, fx, vl);
            vb = __riscv_vfmacc(vb, exp_C2, fx, vl);
            vfloat32m1_t vv = __riscv_vfmul(vb, vb, vl);

            /* exp(r) polynomial via fused Horner evaluation */
            vfloat32m1_t y = ce0;
            y = __riscv_vfmadd(y, vb, ce1, vl);
            y = __riscv_vfmadd(y, vb, ce2, vl);
            y = __riscv_vfmadd(y, vb, ce3, vl);
            y = __riscv_vfmadd(y, vb, ce4, vl);
            y = __riscv_vfmadd(y, vb, ce5, vl);
            y = __riscv_vfmadd(y, vv, vb, vl);
            y = __riscv_vfadd(y, cf1, vl);

            /* 2^n built by placing (n + 127) into the exponent field */
            vfloat32m1_t pow2n = __riscv_vreinterpret_f32m1(__riscv_vsll(
                __riscv_vadd(__riscv_vfcvt_rtz_x(fx, vl), c127, vl), 23, vl));

            exp = __riscv_vfmul(y, pow2n, vl);
        }

        __riscv_vse32(cVector, exp, vl);
    }
}
1099 
1100 #endif /*LV_HAVE_RVV*/
1101 
#endif /* INCLUDED_volk_32f_x2_pow_32f_u_H */
/* (Doxygen cross-reference residue: volk_32f_x2_pow_32f_generic is
   defined above in this file.) */