Vector Optimized Library of Kernels 3.0.0
Architecture-tuned implementations of math kernels
 
volk_32f_x2_pow_32f.h
/* -*- c++ -*- */
/*
 * Copyright 2014 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

#ifndef INCLUDED_volk_32f_x2_pow_32f_a_H
#define INCLUDED_volk_32f_x2_pow_32f_a_H

#include <inttypes.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#define POW_POLY_DEGREE 3

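/*
 * Overview (a sketch inferred from the kernels below): every kernel in this
 * file computes c[i] = powf(a[i], b[i]) as exp(b[i] * ln(a[i])).  ln(a) is
 * recovered from the IEEE-754 bit layout,
 *
 *     a = m * 2^e,  1 <= m < 2   =>   ln(a) = (e + log2(m)) * ln(2),
 *
 * where log2(m) is approximated as P(m) * (m - 1) with P evaluated by the
 * POLY* Horner macros, for a total approximation degree of POW_POLY_DEGREE;
 * the default of 3 trades accuracy for speed.  The exp() step uses a
 * Cephes-style range reduction followed by a degree-5 polynomial.
 */
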
#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

#define POLY0_AVX2_FMA(x, c0) _mm256_set1_ps(c0)
#define POLY1_AVX2_FMA(x, c0, c1) \
    _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0))
#define POLY2_AVX2_FMA(x, c0, c1, c2) \
    _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0))
#define POLY3_AVX2_FMA(x, c0, c1, c2, c3) \
    _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0))
#define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) \
    _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
#define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) \
    _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))

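/*
 * Note: the POLYn macros expand recursively into a Horner evaluation of an
 * nth-degree polynomial, one fused multiply-add per degree.  For example:
 *
 *     POLY2_AVX2_FMA(x, c0, c1, c2)
 *         == fma(fma(c2, x, c1), x, c0)
 *         == (c2*x + c1)*x + c0
 *         == c2*x^2 + c1*x + c0
 */
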
static inline void volk_32f_x2_pow_32f_a_avx2_fma(float* cVector,
                                                  const float* bVector,
                                                  const float* aVector,
                                                  unsigned int num_points)
{
    float* cPtr = cVector;
    const float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
    __m256 tmp, fx, mask, pow2n, z, y;
    __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
    __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
    __m256i bias, exp, emm0, pi32_0x7f;

    one = _mm256_set1_ps(1.0);
    exp_hi = _mm256_set1_ps(88.3762626647949);
    exp_lo = _mm256_set1_ps(-88.3762626647949);
    ln2 = _mm256_set1_ps(0.6931471805);
    log2EF = _mm256_set1_ps(1.44269504088896341);
    half = _mm256_set1_ps(0.5);
    exp_C1 = _mm256_set1_ps(0.693359375);
    exp_C2 = _mm256_set1_ps(-2.12194440e-4);
    pi32_0x7f = _mm256_set1_epi32(0x7f);

    exp_p0 = _mm256_set1_ps(1.9875691500e-4);
    exp_p1 = _mm256_set1_ps(1.3981999507e-3);
    exp_p2 = _mm256_set1_ps(8.3334519073e-3);
    exp_p3 = _mm256_set1_ps(4.1665795894e-2);
    exp_p4 = _mm256_set1_ps(1.6666665459e-1);
    exp_p5 = _mm256_set1_ps(5.0000001201e-1);

    for (; number < eighthPoints; number++) {
        // First compute the logarithm
        aVal = _mm256_load_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        logarithm = _mm256_cvtepi32_ps(exp);

        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

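        /* The two statements above split aVal by its IEEE-754 fields.
         * Bits 30..23 (mask 0x7f800000) hold the biased exponent, so shifting
         * right by 23 and subtracting bias (127) yields the unbiased exponent
         * e; bits 22..0 (mask 0x7fffff) hold the fraction, and OR-ing in
         * leadingOne rebuilds the mantissa m in [1, 2).  Thus a = m * 2^e and
         * log2(a) = e + log2(m).  Zero, negative, NaN and denormal inputs are
         * not special-cased by this bit manipulation. */
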
#if POW_POLY_DEGREE == 6
        mantissa = POLY5_AVX2_FMA(frac,
                                  3.1157899f,
                                  -3.3241990f,
                                  2.5988452f,
                                  -1.2315303f,
                                  3.1821337e-1f,
                                  -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
        mantissa = POLY4_AVX2_FMA(frac,
                                  2.8882704548164776201f,
                                  -2.52074962577807006663f,
                                  1.48116647521213171641f,
                                  -0.465725644288844778798f,
                                  0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
        mantissa = POLY3_AVX2_FMA(frac,
                                  2.61761038894603480148f,
                                  -1.75647175389045657003f,
                                  0.688243882994381274313f,
                                  -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
        mantissa = POLY2_AVX2_FMA(frac,
                                  2.28330284476918490682f,
                                  -1.04913055217340124191f,
                                  0.204446009836232697516f);
#else
#error "POW_POLY_DEGREE must be 3, 4, 5 or 6"
#endif

        logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm);
        logarithm = _mm256_mul_ps(logarithm, ln2);

        // Now calculate b*lna
        bVal = _mm256_load_ps(bPtr);
        bVal = _mm256_mul_ps(bVal, logarithm);

        // Now compute exp(b*lna)
        bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);

        fx = _mm256_fmadd_ps(bVal, log2EF, half);

        emm0 = _mm256_cvttps_epi32(fx);
        tmp = _mm256_cvtepi32_ps(emm0);

        mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
        fx = _mm256_sub_ps(tmp, mask);

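        /* The four statements above compute fx = floor(bVal * log2(e) + 0.5),
         * i.e. bVal/ln(2) rounded to the nearest integer n.
         * _mm256_cvttps_epi32 truncates toward zero, which overshoots by one
         * for negative non-integers, so the compare-and-subtract corrects
         * that case. */
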
        tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal);
        bVal = _mm256_fnmadd_ps(fx, exp_C2, tmp);
        z = _mm256_mul_ps(bVal, bVal);

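        /* The two fnmadd steps above perform the classic Cephes-style range
         * reduction r = bVal - n*ln(2), with ln(2) split into exp_C1 + exp_C2
         * (a coarse part exactly representable in float plus a small
         * correction) so the subtraction loses less precision.  exp(r) is
         * then approximated below as 1 + r + r^2 * Q(r). */
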
        y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1);
        y = _mm256_fmadd_ps(y, bVal, exp_p2);
        y = _mm256_fmadd_ps(y, bVal, exp_p3);
        y = _mm256_fmadd_ps(y, bVal, exp_p4);
        y = _mm256_fmadd_ps(y, bVal, exp_p5);
        y = _mm256_fmadd_ps(y, z, bVal);
        y = _mm256_add_ps(y, one);

        emm0 =
            _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);

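        /* The shift above constructs 2^fx directly in the float bit pattern:
         * (n + 127) << 23 places the re-biased exponent in the IEEE-754
         * exponent field over a zero fraction, which is exactly 2^n once
         * reinterpreted as float. */
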
        pow2n = _mm256_castsi256_ps(emm0);
        cVal = _mm256_mul_ps(y, pow2n);

        _mm256_store_ps(cPtr, cVal);

        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *cPtr++ = powf(*aPtr++, *bPtr++);
    }
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
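
/*!
 * Example (an illustrative sketch, not from the original header): calling the
 * kernel through the generated VOLK dispatcher volk_32f_x2_pow_32f, which
 * selects the best protokernel at runtime.  Assumes the standard VOLK helpers
 * volk_get_alignment(), volk_malloc() and volk_free().
 * \code
 * unsigned int N = 1024;
 * float* a = (float*)volk_malloc(sizeof(float) * N, volk_get_alignment());
 * float* b = (float*)volk_malloc(sizeof(float) * N, volk_get_alignment());
 * float* c = (float*)volk_malloc(sizeof(float) * N, volk_get_alignment());
 * for (unsigned int i = 0; i < N; i++) {
 *     a[i] = 1.0f + i / (float)N; // positive bases; the bit tricks assume a > 0
 *     b[i] = 2.5f;
 * }
 * volk_32f_x2_pow_32f(c, b, a, N); // c[i] = powf(a[i], b[i])
 * volk_free(a);
 * volk_free(b);
 * volk_free(c);
 * \endcode
 */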

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

#define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
#define POLY1_AVX2(x, c0, c1) \
    _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
#define POLY2_AVX2(x, c0, c1, c2) \
    _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
#define POLY3_AVX2(x, c0, c1, c2, c3) \
    _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
    _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
    _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))

static inline void volk_32f_x2_pow_32f_a_avx2(float* cVector,
                                              const float* bVector,
                                              const float* aVector,
                                              unsigned int num_points)
{
    float* cPtr = cVector;
    const float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
    __m256 tmp, fx, mask, pow2n, z, y;
    __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
    __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
    __m256i bias, exp, emm0, pi32_0x7f;

    one = _mm256_set1_ps(1.0);
    exp_hi = _mm256_set1_ps(88.3762626647949);
    exp_lo = _mm256_set1_ps(-88.3762626647949);
    ln2 = _mm256_set1_ps(0.6931471805);
    log2EF = _mm256_set1_ps(1.44269504088896341);
    half = _mm256_set1_ps(0.5);
    exp_C1 = _mm256_set1_ps(0.693359375);
    exp_C2 = _mm256_set1_ps(-2.12194440e-4);
    pi32_0x7f = _mm256_set1_epi32(0x7f);

    exp_p0 = _mm256_set1_ps(1.9875691500e-4);
    exp_p1 = _mm256_set1_ps(1.3981999507e-3);
    exp_p2 = _mm256_set1_ps(8.3334519073e-3);
    exp_p3 = _mm256_set1_ps(4.1665795894e-2);
    exp_p4 = _mm256_set1_ps(1.6666665459e-1);
    exp_p5 = _mm256_set1_ps(5.0000001201e-1);

    for (; number < eighthPoints; number++) {
        // First compute the logarithm
        aVal = _mm256_load_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        logarithm = _mm256_cvtepi32_ps(exp);

        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

#if POW_POLY_DEGREE == 6
        mantissa = POLY5_AVX2(frac,
                              3.1157899f,
                              -3.3241990f,
                              2.5988452f,
                              -1.2315303f,
                              3.1821337e-1f,
                              -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
        mantissa = POLY4_AVX2(frac,
                              2.8882704548164776201f,
                              -2.52074962577807006663f,
                              1.48116647521213171641f,
                              -0.465725644288844778798f,
                              0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
        mantissa = POLY3_AVX2(frac,
                              2.61761038894603480148f,
                              -1.75647175389045657003f,
                              0.688243882994381274313f,
                              -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
        mantissa = POLY2_AVX2(frac,
                              2.28330284476918490682f,
                              -1.04913055217340124191f,
                              0.204446009836232697516f);
#else
#error "POW_POLY_DEGREE must be 3, 4, 5 or 6"
#endif

        logarithm = _mm256_add_ps(
            _mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), logarithm);
        logarithm = _mm256_mul_ps(logarithm, ln2);

        // Now calculate b*lna
        bVal = _mm256_load_ps(bPtr);
        bVal = _mm256_mul_ps(bVal, logarithm);

        // Now compute exp(b*lna)
        bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);

        fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half);

        emm0 = _mm256_cvttps_epi32(fx);
        tmp = _mm256_cvtepi32_ps(emm0);

        mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
        fx = _mm256_sub_ps(tmp, mask);

        tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1));
        bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2));
        z = _mm256_mul_ps(bVal, bVal);

        y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5);
        y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal);
        y = _mm256_add_ps(y, one);

        emm0 =
            _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);

        pow2n = _mm256_castsi256_ps(emm0);
        cVal = _mm256_mul_ps(y, pow2n);

        _mm256_store_ps(cPtr, cVal);

        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *cPtr++ = powf(*aPtr++, *bPtr++);
    }
}

#endif /* LV_HAVE_AVX2 for aligned */

#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

#define POLY0(x, c0) _mm_set1_ps(c0)
#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
#define POLY3(x, c0, c1, c2, c3) \
    _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
#define POLY4(x, c0, c1, c2, c3, c4) \
    _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
#define POLY5(x, c0, c1, c2, c3, c4, c5) \
    _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))

static inline void volk_32f_x2_pow_32f_a_sse4_1(float* cVector,
                                                const float* bVector,
                                                const float* aVector,
                                                unsigned int num_points)
{
    float* cPtr = cVector;
    const float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
    __m128 tmp, fx, mask, pow2n, z, y;
    __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
    __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
    __m128i bias, exp, emm0, pi32_0x7f;

    one = _mm_set1_ps(1.0);
    exp_hi = _mm_set1_ps(88.3762626647949);
    exp_lo = _mm_set1_ps(-88.3762626647949);
    ln2 = _mm_set1_ps(0.6931471805);
    log2EF = _mm_set1_ps(1.44269504088896341);
    half = _mm_set1_ps(0.5);
    exp_C1 = _mm_set1_ps(0.693359375);
    exp_C2 = _mm_set1_ps(-2.12194440e-4);
    pi32_0x7f = _mm_set1_epi32(0x7f);

    exp_p0 = _mm_set1_ps(1.9875691500e-4);
    exp_p1 = _mm_set1_ps(1.3981999507e-3);
    exp_p2 = _mm_set1_ps(8.3334519073e-3);
    exp_p3 = _mm_set1_ps(4.1665795894e-2);
    exp_p4 = _mm_set1_ps(1.6666665459e-1);
    exp_p5 = _mm_set1_ps(5.0000001201e-1);

    for (; number < quarterPoints; number++) {
        // First compute the logarithm
        aVal = _mm_load_ps(aPtr);
        bias = _mm_set1_epi32(127);
        leadingOne = _mm_set1_ps(1.0f);
        exp = _mm_sub_epi32(
            _mm_srli_epi32(
                _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23),
            bias);
        logarithm = _mm_cvtepi32_ps(exp);

        frac = _mm_or_ps(leadingOne,
                         _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));

#if POW_POLY_DEGREE == 6
        mantissa = POLY5(frac,
                         3.1157899f,
                         -3.3241990f,
                         2.5988452f,
                         -1.2315303f,
                         3.1821337e-1f,
                         -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
        mantissa = POLY4(frac,
                         2.8882704548164776201f,
                         -2.52074962577807006663f,
                         1.48116647521213171641f,
                         -0.465725644288844778798f,
                         0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
        mantissa = POLY3(frac,
                         2.61761038894603480148f,
                         -1.75647175389045657003f,
                         0.688243882994381274313f,
                         -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
        mantissa = POLY2(frac,
                         2.28330284476918490682f,
                         -1.04913055217340124191f,
                         0.204446009836232697516f);
#else
#error "POW_POLY_DEGREE must be 3, 4, 5 or 6"
#endif

        logarithm =
            _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
        logarithm = _mm_mul_ps(logarithm, ln2);

        // Now calculate b*lna
        bVal = _mm_load_ps(bPtr);
        bVal = _mm_mul_ps(bVal, logarithm);

        // Now compute exp(b*lna)
        bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo);

        fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half);

        emm0 = _mm_cvttps_epi32(fx);
        tmp = _mm_cvtepi32_ps(emm0);

        mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
        fx = _mm_sub_ps(tmp, mask);

        tmp = _mm_mul_ps(fx, exp_C1);
        z = _mm_mul_ps(fx, exp_C2);
        bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z);
        z = _mm_mul_ps(bVal, bVal);

        y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal);
        y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3);
        y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal);
        y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal);
        y = _mm_add_ps(y, one);

        emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);

        pow2n = _mm_castsi128_ps(emm0);
        cVal = _mm_mul_ps(y, pow2n);

        _mm_store_ps(cPtr, cVal);

        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *cPtr++ = powf(*aPtr++, *bPtr++);
    }
}

#endif /* LV_HAVE_SSE4_1 for aligned */

#endif /* INCLUDED_volk_32f_x2_pow_32f_a_H */

#ifndef INCLUDED_volk_32f_x2_pow_32f_u_H
#define INCLUDED_volk_32f_x2_pow_32f_u_H

#include <inttypes.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#define POW_POLY_DEGREE 3

#ifdef LV_HAVE_GENERIC

static inline void volk_32f_x2_pow_32f_generic(float* cVector,
                                               const float* bVector,
                                               const float* aVector,
                                               unsigned int num_points)
{
    float* cPtr = cVector;
    const float* bPtr = bVector;
    const float* aPtr = aVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        *cPtr++ = powf(*aPtr++, *bPtr++);
    }
}
#endif /* LV_HAVE_GENERIC */
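
/*!
 * A quick self-check sketch (illustrative, not part of the original header):
 * the generic kernel doubles as a reference when validating a SIMD
 * protokernel such as volk_32f_x2_pow_32f_u_sse4_1 defined further below.
 * The loose relative tolerance is an assumption reflecting the default
 * degree-3 log2 polynomial.
 * \code
 * #include <assert.h>
 * float a[64], b[64], ref[64], out[64];
 * for (unsigned int i = 0; i < 64; i++) {
 *     a[i] = 0.5f + 0.1f * i; // positive bases only
 *     b[i] = 1.5f;
 * }
 * volk_32f_x2_pow_32f_generic(ref, b, a, 64);
 * volk_32f_x2_pow_32f_u_sse4_1(out, b, a, 64); // requires LV_HAVE_SSE4_1
 * for (unsigned int i = 0; i < 64; i++)
 *     assert(fabsf(out[i] - ref[i]) <= 1e-3f * fabsf(ref[i]));
 * \endcode
 */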

#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

#define POLY0(x, c0) _mm_set1_ps(c0)
#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
#define POLY3(x, c0, c1, c2, c3) \
    _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
#define POLY4(x, c0, c1, c2, c3, c4) \
    _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
#define POLY5(x, c0, c1, c2, c3, c4, c5) \
    _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))

static inline void volk_32f_x2_pow_32f_u_sse4_1(float* cVector,
                                                const float* bVector,
                                                const float* aVector,
                                                unsigned int num_points)
{
    float* cPtr = cVector;
    const float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
    __m128 tmp, fx, mask, pow2n, z, y;
    __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
    __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
    __m128i bias, exp, emm0, pi32_0x7f;

    one = _mm_set1_ps(1.0);
    exp_hi = _mm_set1_ps(88.3762626647949);
    exp_lo = _mm_set1_ps(-88.3762626647949);
    ln2 = _mm_set1_ps(0.6931471805);
    log2EF = _mm_set1_ps(1.44269504088896341);
    half = _mm_set1_ps(0.5);
    exp_C1 = _mm_set1_ps(0.693359375);
    exp_C2 = _mm_set1_ps(-2.12194440e-4);
    pi32_0x7f = _mm_set1_epi32(0x7f);

    exp_p0 = _mm_set1_ps(1.9875691500e-4);
    exp_p1 = _mm_set1_ps(1.3981999507e-3);
    exp_p2 = _mm_set1_ps(8.3334519073e-3);
    exp_p3 = _mm_set1_ps(4.1665795894e-2);
    exp_p4 = _mm_set1_ps(1.6666665459e-1);
    exp_p5 = _mm_set1_ps(5.0000001201e-1);

    for (; number < quarterPoints; number++) {
        // First compute the logarithm
        aVal = _mm_loadu_ps(aPtr);
        bias = _mm_set1_epi32(127);
        leadingOne = _mm_set1_ps(1.0f);
        exp = _mm_sub_epi32(
            _mm_srli_epi32(
                _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23),
            bias);
        logarithm = _mm_cvtepi32_ps(exp);

        frac = _mm_or_ps(leadingOne,
                         _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));

#if POW_POLY_DEGREE == 6
        mantissa = POLY5(frac,
                         3.1157899f,
                         -3.3241990f,
                         2.5988452f,
                         -1.2315303f,
                         3.1821337e-1f,
                         -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
        mantissa = POLY4(frac,
                         2.8882704548164776201f,
                         -2.52074962577807006663f,
                         1.48116647521213171641f,
                         -0.465725644288844778798f,
                         0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
        mantissa = POLY3(frac,
                         2.61761038894603480148f,
                         -1.75647175389045657003f,
                         0.688243882994381274313f,
                         -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
        mantissa = POLY2(frac,
                         2.28330284476918490682f,
                         -1.04913055217340124191f,
                         0.204446009836232697516f);
#else
#error "POW_POLY_DEGREE must be 3, 4, 5 or 6"
#endif

        logarithm =
            _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
        logarithm = _mm_mul_ps(logarithm, ln2);

        // Now calculate b*lna
        bVal = _mm_loadu_ps(bPtr);
        bVal = _mm_mul_ps(bVal, logarithm);

        // Now compute exp(b*lna)
        bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo);

        fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half);

        emm0 = _mm_cvttps_epi32(fx);
        tmp = _mm_cvtepi32_ps(emm0);

        mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
        fx = _mm_sub_ps(tmp, mask);

        tmp = _mm_mul_ps(fx, exp_C1);
        z = _mm_mul_ps(fx, exp_C2);
        bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z);
        z = _mm_mul_ps(bVal, bVal);

        y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal);
        y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3);
        y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal);
        y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal);
        y = _mm_add_ps(y, one);

        emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);

        pow2n = _mm_castsi128_ps(emm0);
        cVal = _mm_mul_ps(y, pow2n);

        _mm_storeu_ps(cPtr, cVal);

        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *cPtr++ = powf(*aPtr++, *bPtr++);
    }
}

#endif /* LV_HAVE_SSE4_1 for unaligned */

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

#define POLY0_AVX2_FMA(x, c0) _mm256_set1_ps(c0)
#define POLY1_AVX2_FMA(x, c0, c1) \
    _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0))
#define POLY2_AVX2_FMA(x, c0, c1, c2) \
    _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0))
#define POLY3_AVX2_FMA(x, c0, c1, c2, c3) \
    _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0))
#define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) \
    _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
#define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) \
    _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))

static inline void volk_32f_x2_pow_32f_u_avx2_fma(float* cVector,
                                                  const float* bVector,
                                                  const float* aVector,
                                                  unsigned int num_points)
{
    float* cPtr = cVector;
    const float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
    __m256 tmp, fx, mask, pow2n, z, y;
    __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
    __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
    __m256i bias, exp, emm0, pi32_0x7f;

    one = _mm256_set1_ps(1.0);
    exp_hi = _mm256_set1_ps(88.3762626647949);
    exp_lo = _mm256_set1_ps(-88.3762626647949);
    ln2 = _mm256_set1_ps(0.6931471805);
    log2EF = _mm256_set1_ps(1.44269504088896341);
    half = _mm256_set1_ps(0.5);
    exp_C1 = _mm256_set1_ps(0.693359375);
    exp_C2 = _mm256_set1_ps(-2.12194440e-4);
    pi32_0x7f = _mm256_set1_epi32(0x7f);

    exp_p0 = _mm256_set1_ps(1.9875691500e-4);
    exp_p1 = _mm256_set1_ps(1.3981999507e-3);
    exp_p2 = _mm256_set1_ps(8.3334519073e-3);
    exp_p3 = _mm256_set1_ps(4.1665795894e-2);
    exp_p4 = _mm256_set1_ps(1.6666665459e-1);
    exp_p5 = _mm256_set1_ps(5.0000001201e-1);

    for (; number < eighthPoints; number++) {
        // First compute the logarithm
        aVal = _mm256_loadu_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        logarithm = _mm256_cvtepi32_ps(exp);

        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

#if POW_POLY_DEGREE == 6
        mantissa = POLY5_AVX2_FMA(frac,
                                  3.1157899f,
                                  -3.3241990f,
                                  2.5988452f,
                                  -1.2315303f,
                                  3.1821337e-1f,
                                  -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
        mantissa = POLY4_AVX2_FMA(frac,
                                  2.8882704548164776201f,
                                  -2.52074962577807006663f,
                                  1.48116647521213171641f,
                                  -0.465725644288844778798f,
                                  0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
        mantissa = POLY3_AVX2_FMA(frac,
                                  2.61761038894603480148f,
                                  -1.75647175389045657003f,
                                  0.688243882994381274313f,
                                  -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
        mantissa = POLY2_AVX2_FMA(frac,
                                  2.28330284476918490682f,
                                  -1.04913055217340124191f,
                                  0.204446009836232697516f);
#else
#error "POW_POLY_DEGREE must be 3, 4, 5 or 6"
#endif

        logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm);
        logarithm = _mm256_mul_ps(logarithm, ln2);

        // Now calculate b*lna
        bVal = _mm256_loadu_ps(bPtr);
        bVal = _mm256_mul_ps(bVal, logarithm);

        // Now compute exp(b*lna)
        bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);

        fx = _mm256_fmadd_ps(bVal, log2EF, half);

        emm0 = _mm256_cvttps_epi32(fx);
        tmp = _mm256_cvtepi32_ps(emm0);

        mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
        fx = _mm256_sub_ps(tmp, mask);

        tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal);
        bVal = _mm256_fnmadd_ps(fx, exp_C2, tmp);
        z = _mm256_mul_ps(bVal, bVal);

        y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1);
        y = _mm256_fmadd_ps(y, bVal, exp_p2);
        y = _mm256_fmadd_ps(y, bVal, exp_p3);
        y = _mm256_fmadd_ps(y, bVal, exp_p4);
        y = _mm256_fmadd_ps(y, bVal, exp_p5);
        y = _mm256_fmadd_ps(y, z, bVal);
        y = _mm256_add_ps(y, one);

        emm0 =
            _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);

        pow2n = _mm256_castsi256_ps(emm0);
        cVal = _mm256_mul_ps(y, pow2n);

        _mm256_storeu_ps(cPtr, cVal);

        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *cPtr++ = powf(*aPtr++, *bPtr++);
    }
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

#define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
#define POLY1_AVX2(x, c0, c1) \
    _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
#define POLY2_AVX2(x, c0, c1, c2) \
    _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
#define POLY3_AVX2(x, c0, c1, c2, c3) \
    _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
    _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
    _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))

static inline void volk_32f_x2_pow_32f_u_avx2(float* cVector,
                                              const float* bVector,
                                              const float* aVector,
                                              unsigned int num_points)
{
    float* cPtr = cVector;
    const float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
    __m256 tmp, fx, mask, pow2n, z, y;
    __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
    __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
    __m256i bias, exp, emm0, pi32_0x7f;

    one = _mm256_set1_ps(1.0);
    exp_hi = _mm256_set1_ps(88.3762626647949);
    exp_lo = _mm256_set1_ps(-88.3762626647949);
    ln2 = _mm256_set1_ps(0.6931471805);
    log2EF = _mm256_set1_ps(1.44269504088896341);
    half = _mm256_set1_ps(0.5);
    exp_C1 = _mm256_set1_ps(0.693359375);
    exp_C2 = _mm256_set1_ps(-2.12194440e-4);
    pi32_0x7f = _mm256_set1_epi32(0x7f);

    exp_p0 = _mm256_set1_ps(1.9875691500e-4);
    exp_p1 = _mm256_set1_ps(1.3981999507e-3);
    exp_p2 = _mm256_set1_ps(8.3334519073e-3);
    exp_p3 = _mm256_set1_ps(4.1665795894e-2);
    exp_p4 = _mm256_set1_ps(1.6666665459e-1);
    exp_p5 = _mm256_set1_ps(5.0000001201e-1);

    for (; number < eighthPoints; number++) {
        // First compute the logarithm
        aVal = _mm256_loadu_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        logarithm = _mm256_cvtepi32_ps(exp);

        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

#if POW_POLY_DEGREE == 6
        mantissa = POLY5_AVX2(frac,
                              3.1157899f,
                              -3.3241990f,
                              2.5988452f,
                              -1.2315303f,
                              3.1821337e-1f,
                              -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
        mantissa = POLY4_AVX2(frac,
                              2.8882704548164776201f,
                              -2.52074962577807006663f,
                              1.48116647521213171641f,
                              -0.465725644288844778798f,
                              0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
        mantissa = POLY3_AVX2(frac,
                              2.61761038894603480148f,
                              -1.75647175389045657003f,
                              0.688243882994381274313f,
                              -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
        mantissa = POLY2_AVX2(frac,
                              2.28330284476918490682f,
                              -1.04913055217340124191f,
                              0.204446009836232697516f);
#else
#error "POW_POLY_DEGREE must be 3, 4, 5 or 6"
#endif

        logarithm = _mm256_add_ps(
            _mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), logarithm);
        logarithm = _mm256_mul_ps(logarithm, ln2);

        // Now calculate b*lna
        bVal = _mm256_loadu_ps(bPtr);
        bVal = _mm256_mul_ps(bVal, logarithm);

        // Now compute exp(b*lna)
        bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);

        fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half);

        emm0 = _mm256_cvttps_epi32(fx);
        tmp = _mm256_cvtepi32_ps(emm0);

        mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
        fx = _mm256_sub_ps(tmp, mask);

        tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1));
        bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2));
        z = _mm256_mul_ps(bVal, bVal);

        y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5);
        y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal);
        y = _mm256_add_ps(y, one);

        emm0 =
            _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);

        pow2n = _mm256_castsi256_ps(emm0);
        cVal = _mm256_mul_ps(y, pow2n);

        _mm256_storeu_ps(cPtr, cVal);

        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *cPtr++ = powf(*aPtr++, *bPtr++);
    }
}

#endif /* LV_HAVE_AVX2 for unaligned */

#endif /* INCLUDED_volk_32f_x2_pow_32f_u_H */