Vector Optimized Library of Kernels  3.2.0
Architecture-tuned implementations of math kernels
volk_32f_sin_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
59 #include <inttypes.h>
60 #include <math.h>
61 #include <stdio.h>
62 
63 #ifndef INCLUDED_volk_32f_sin_32f_a_H
64 #define INCLUDED_volk_32f_sin_32f_a_H
65 #ifdef LV_HAVE_AVX512F
66 
67 #include <immintrin.h>
68 static inline void volk_32f_sin_32f_a_avx512f(float* sinVector,
69  const float* inVector,
70  unsigned int num_points)
71 {
72  float* sinPtr = sinVector;
73  const float* inPtr = inVector;
74 
75  unsigned int number = 0;
76  unsigned int sixteenPoints = num_points / 16;
77  unsigned int i = 0;
78 
79  __m512 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
80  fones;
81  __m512 sine, cosine;
82  __m512i q, zeros, ones, twos, fours;
83 
84  m4pi = _mm512_set1_ps(1.273239544735162542821171882678754627704620361328125);
85  pio4A = _mm512_set1_ps(0.7853981554508209228515625);
86  pio4B = _mm512_set1_ps(0.794662735614792836713604629039764404296875e-8);
87  pio4C = _mm512_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
88  ffours = _mm512_set1_ps(4.0);
89  ftwos = _mm512_set1_ps(2.0);
90  fones = _mm512_set1_ps(1.0);
91  zeros = _mm512_setzero_epi32();
92  ones = _mm512_set1_epi32(1);
93  twos = _mm512_set1_epi32(2);
94  fours = _mm512_set1_epi32(4);
95 
96  cp1 = _mm512_set1_ps(1.0);
97  cp2 = _mm512_set1_ps(0.08333333333333333);
98  cp3 = _mm512_set1_ps(0.002777777777777778);
99  cp4 = _mm512_set1_ps(4.96031746031746e-05);
100  cp5 = _mm512_set1_ps(5.511463844797178e-07);
101  __mmask16 condition1, condition2, ltZero;
102 
103  for (; number < sixteenPoints; number++) {
104  aVal = _mm512_load_ps(inPtr);
105  // s = fabs(aVal)
106  s = (__m512)(_mm512_and_si512((__m512i)(aVal), _mm512_set1_epi32(0x7fffffff)));
107 
108  // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
109  q = _mm512_cvtps_epi32(_mm512_floor_ps(_mm512_mul_ps(s, m4pi)));
110  // r = q + q&1, q indicates quadrant, r gives
111  r = _mm512_cvtepi32_ps(_mm512_add_epi32(q, _mm512_and_si512(q, ones)));
112 
113  s = _mm512_fnmadd_ps(r, pio4A, s);
114  s = _mm512_fnmadd_ps(r, pio4B, s);
115  s = _mm512_fnmadd_ps(r, pio4C, s);
116 
117  s = _mm512_div_ps(
118  s,
119  _mm512_set1_ps(8.0f)); // The constant is 2^N, for 3 times argument reduction
120  s = _mm512_mul_ps(s, s);
121  // Evaluate Taylor series
122  s = _mm512_mul_ps(
123  _mm512_fmadd_ps(
124  _mm512_fmsub_ps(
125  _mm512_fmadd_ps(_mm512_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
126  s,
127  cp1),
128  s);
129 
130  for (i = 0; i < 3; i++) {
131  s = _mm512_mul_ps(s, _mm512_sub_ps(ffours, s));
132  }
133  s = _mm512_div_ps(s, ftwos);
134 
135  sine = _mm512_sqrt_ps(_mm512_mul_ps(_mm512_sub_ps(ftwos, s), s));
136  cosine = _mm512_sub_ps(fones, s);
137 
138  condition1 = _mm512_cmpneq_epi32_mask(
139  _mm512_and_si512(_mm512_add_epi32(q, ones), twos), zeros);
140  ltZero = _mm512_cmp_ps_mask(aVal, _mm512_setzero_ps(), _CMP_LT_OS);
141  condition2 = _mm512_kxor(
142  _mm512_cmpneq_epi32_mask(_mm512_and_epi32(q, fours), zeros), ltZero);
143 
144  sine = _mm512_mask_blend_ps(condition1, sine, cosine);
145  sine = _mm512_mask_mul_ps(sine, condition2, sine, _mm512_set1_ps(-1.f));
146  _mm512_store_ps(sinPtr, sine);
147  inPtr += 16;
148  sinPtr += 16;
149  }
150 
151  number = sixteenPoints * 16;
152  for (; number < num_points; number++) {
153  *sinPtr++ = sinf(*inPtr++);
154  }
155 }
156 #endif
157 #if LV_HAVE_AVX2 && LV_HAVE_FMA
158 #include <immintrin.h>
159 
160 static inline void
161 volk_32f_sin_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
162 {
163  float* bPtr = bVector;
164  const float* aPtr = aVector;
165 
166  unsigned int number = 0;
167  unsigned int eighthPoints = num_points / 8;
168  unsigned int i = 0;
169 
170  __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
171  fzeroes;
172  __m256 sine, cosine, condition1, condition2;
173  __m256i q, r, ones, twos, fours;
174 
175  m4pi = _mm256_set1_ps(1.273239545);
176  pio4A = _mm256_set1_ps(0.78515625);
177  pio4B = _mm256_set1_ps(0.241876e-3);
178  ffours = _mm256_set1_ps(4.0);
179  ftwos = _mm256_set1_ps(2.0);
180  fones = _mm256_set1_ps(1.0);
181  fzeroes = _mm256_setzero_ps();
182  ones = _mm256_set1_epi32(1);
183  twos = _mm256_set1_epi32(2);
184  fours = _mm256_set1_epi32(4);
185 
186  cp1 = _mm256_set1_ps(1.0);
187  cp2 = _mm256_set1_ps(0.83333333e-1);
188  cp3 = _mm256_set1_ps(0.2777778e-2);
189  cp4 = _mm256_set1_ps(0.49603e-4);
190  cp5 = _mm256_set1_ps(0.551e-6);
191 
192  for (; number < eighthPoints; number++) {
193  aVal = _mm256_load_ps(aPtr);
194  s = _mm256_sub_ps(aVal,
195  _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
196  _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
197  q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
198  r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
199 
200  s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
201  s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);
202 
203  s = _mm256_div_ps(
204  s,
205  _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
206  s = _mm256_mul_ps(s, s);
207  // Evaluate Taylor series
208  s = _mm256_mul_ps(
209  _mm256_fmadd_ps(
210  _mm256_fmsub_ps(
211  _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
212  s,
213  cp1),
214  s);
215 
216  for (i = 0; i < 3; i++) {
217  s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
218  }
219  s = _mm256_div_ps(s, ftwos);
220 
221  sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
222  cosine = _mm256_sub_ps(fones, s);
223 
224  condition1 = _mm256_cmp_ps(
225  _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
226  fzeroes,
227  _CMP_NEQ_UQ);
228  condition2 = _mm256_cmp_ps(
229  _mm256_cmp_ps(
230  _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
231  _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
232  _CMP_NEQ_UQ);
233  // Need this condition only for cos
234  // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q,
235  // twos), fours)), fzeroes);
236 
237  sine =
238  _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
239  sine = _mm256_sub_ps(
240  sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
241  _mm256_store_ps(bPtr, sine);
242  aPtr += 8;
243  bPtr += 8;
244  }
245 
246  number = eighthPoints * 8;
247  for (; number < num_points; number++) {
248  *bPtr++ = sin(*aPtr++);
249  }
250 }
251 
252 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
253 
254 #ifdef LV_HAVE_AVX2
255 #include <immintrin.h>
256 
257 static inline void
258 volk_32f_sin_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points)
259 {
260  float* bPtr = bVector;
261  const float* aPtr = aVector;
262 
263  unsigned int number = 0;
264  unsigned int eighthPoints = num_points / 8;
265  unsigned int i = 0;
266 
267  __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
268  fzeroes;
269  __m256 sine, cosine, condition1, condition2;
270  __m256i q, r, ones, twos, fours;
271 
272  m4pi = _mm256_set1_ps(1.273239545);
273  pio4A = _mm256_set1_ps(0.78515625);
274  pio4B = _mm256_set1_ps(0.241876e-3);
275  ffours = _mm256_set1_ps(4.0);
276  ftwos = _mm256_set1_ps(2.0);
277  fones = _mm256_set1_ps(1.0);
278  fzeroes = _mm256_setzero_ps();
279  ones = _mm256_set1_epi32(1);
280  twos = _mm256_set1_epi32(2);
281  fours = _mm256_set1_epi32(4);
282 
283  cp1 = _mm256_set1_ps(1.0);
284  cp2 = _mm256_set1_ps(0.83333333e-1);
285  cp3 = _mm256_set1_ps(0.2777778e-2);
286  cp4 = _mm256_set1_ps(0.49603e-4);
287  cp5 = _mm256_set1_ps(0.551e-6);
288 
289  for (; number < eighthPoints; number++) {
290  aVal = _mm256_load_ps(aPtr);
291  s = _mm256_sub_ps(aVal,
292  _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
293  _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
294  q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
295  r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
296 
297  s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
298  s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));
299 
300  s = _mm256_div_ps(
301  s,
302  _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
303  s = _mm256_mul_ps(s, s);
304  // Evaluate Taylor series
305  s = _mm256_mul_ps(
306  _mm256_add_ps(
307  _mm256_mul_ps(
308  _mm256_sub_ps(
309  _mm256_mul_ps(
310  _mm256_add_ps(
311  _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
312  s),
313  cp3),
314  s),
315  cp2),
316  s),
317  cp1),
318  s);
319 
320  for (i = 0; i < 3; i++) {
321  s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
322  }
323  s = _mm256_div_ps(s, ftwos);
324 
325  sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
326  cosine = _mm256_sub_ps(fones, s);
327 
328  condition1 = _mm256_cmp_ps(
329  _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
330  fzeroes,
331  _CMP_NEQ_UQ);
332  condition2 = _mm256_cmp_ps(
333  _mm256_cmp_ps(
334  _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
335  _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
336  _CMP_NEQ_UQ);
337  // Need this condition only for cos
338  // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q,
339  // twos), fours)), fzeroes);
340 
341  sine =
342  _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
343  sine = _mm256_sub_ps(
344  sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
345  _mm256_store_ps(bPtr, sine);
346  aPtr += 8;
347  bPtr += 8;
348  }
349 
350  number = eighthPoints * 8;
351  for (; number < num_points; number++) {
352  *bPtr++ = sin(*aPtr++);
353  }
354 }
355 
356 #endif /* LV_HAVE_AVX2 for aligned */
357 
358 #ifdef LV_HAVE_SSE4_1
359 #include <smmintrin.h>
360 
361 static inline void
362 volk_32f_sin_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
363 {
364  float* bPtr = bVector;
365  const float* aPtr = aVector;
366 
367  unsigned int number = 0;
368  unsigned int quarterPoints = num_points / 4;
369  unsigned int i = 0;
370 
371  __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
372  fzeroes;
373  __m128 sine, cosine, condition1, condition2;
374  __m128i q, r, ones, twos, fours;
375 
376  m4pi = _mm_set1_ps(1.273239545);
377  pio4A = _mm_set1_ps(0.78515625);
378  pio4B = _mm_set1_ps(0.241876e-3);
379  ffours = _mm_set1_ps(4.0);
380  ftwos = _mm_set1_ps(2.0);
381  fones = _mm_set1_ps(1.0);
382  fzeroes = _mm_setzero_ps();
383  ones = _mm_set1_epi32(1);
384  twos = _mm_set1_epi32(2);
385  fours = _mm_set1_epi32(4);
386 
387  cp1 = _mm_set1_ps(1.0);
388  cp2 = _mm_set1_ps(0.83333333e-1);
389  cp3 = _mm_set1_ps(0.2777778e-2);
390  cp4 = _mm_set1_ps(0.49603e-4);
391  cp5 = _mm_set1_ps(0.551e-6);
392 
393  for (; number < quarterPoints; number++) {
394  aVal = _mm_load_ps(aPtr);
395  s = _mm_sub_ps(aVal,
396  _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
397  q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
398  r = _mm_add_epi32(q, _mm_and_si128(q, ones));
399 
400  s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
401  s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
402 
403  s = _mm_div_ps(
404  s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
405  s = _mm_mul_ps(s, s);
406  // Evaluate Taylor series
407  s = _mm_mul_ps(
408  _mm_add_ps(
409  _mm_mul_ps(
410  _mm_sub_ps(
411  _mm_mul_ps(
412  _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
413  cp3),
414  s),
415  cp2),
416  s),
417  cp1),
418  s);
419 
420  for (i = 0; i < 3; i++) {
421  s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
422  }
423  s = _mm_div_ps(s, ftwos);
424 
425  sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
426  cosine = _mm_sub_ps(fones, s);
427 
428  condition1 = _mm_cmpneq_ps(
429  _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
430  condition2 = _mm_cmpneq_ps(
431  _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes),
432  _mm_cmplt_ps(aVal, fzeroes));
433  // Need this condition only for cos
434  // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q,
435  // twos), fours)), fzeroes);
436 
437  sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(cosine, sine), condition1));
438  sine =
439  _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
440  _mm_store_ps(bPtr, sine);
441  aPtr += 4;
442  bPtr += 4;
443  }
444 
445  number = quarterPoints * 4;
446  for (; number < num_points; number++) {
447  *bPtr++ = sinf(*aPtr++);
448  }
449 }
450 
451 #endif /* LV_HAVE_SSE4_1 for aligned */
452 
453 
454 #endif /* INCLUDED_volk_32f_sin_32f_a_H */
455 
456 #ifndef INCLUDED_volk_32f_sin_32f_u_H
457 #define INCLUDED_volk_32f_sin_32f_u_H
458 
459 #ifdef LV_HAVE_AVX512F
460 
461 #include <immintrin.h>
462 static inline void volk_32f_sin_32f_u_avx512f(float* sinVector,
463  const float* inVector,
464  unsigned int num_points)
465 {
466  float* sinPtr = sinVector;
467  const float* inPtr = inVector;
468 
469  unsigned int number = 0;
470  unsigned int sixteenPoints = num_points / 16;
471  unsigned int i = 0;
472 
473  __m512 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
474  fones;
475  __m512 sine, cosine;
476  __m512i q, zeros, ones, twos, fours;
477 
478  m4pi = _mm512_set1_ps(1.273239544735162542821171882678754627704620361328125);
479  pio4A = _mm512_set1_ps(0.7853981554508209228515625);
480  pio4B = _mm512_set1_ps(0.794662735614792836713604629039764404296875e-8);
481  pio4C = _mm512_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
482  ffours = _mm512_set1_ps(4.0);
483  ftwos = _mm512_set1_ps(2.0);
484  fones = _mm512_set1_ps(1.0);
485  zeros = _mm512_setzero_epi32();
486  ones = _mm512_set1_epi32(1);
487  twos = _mm512_set1_epi32(2);
488  fours = _mm512_set1_epi32(4);
489 
490  cp1 = _mm512_set1_ps(1.0);
491  cp2 = _mm512_set1_ps(0.08333333333333333);
492  cp3 = _mm512_set1_ps(0.002777777777777778);
493  cp4 = _mm512_set1_ps(4.96031746031746e-05);
494  cp5 = _mm512_set1_ps(5.511463844797178e-07);
495  __mmask16 condition1, condition2, ltZero;
496 
497  for (; number < sixteenPoints; number++) {
498  aVal = _mm512_loadu_ps(inPtr);
499  // s = fabs(aVal)
500  s = (__m512)(_mm512_and_si512((__m512i)(aVal), _mm512_set1_epi32(0x7fffffff)));
501 
502  // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
503  q = _mm512_cvtps_epi32(_mm512_floor_ps(_mm512_mul_ps(s, m4pi)));
504  // r = q + q&1, q indicates quadrant, r gives
505  r = _mm512_cvtepi32_ps(_mm512_add_epi32(q, _mm512_and_si512(q, ones)));
506 
507  s = _mm512_fnmadd_ps(r, pio4A, s);
508  s = _mm512_fnmadd_ps(r, pio4B, s);
509  s = _mm512_fnmadd_ps(r, pio4C, s);
510 
511  s = _mm512_div_ps(
512  s,
513  _mm512_set1_ps(8.0f)); // The constant is 2^N, for 3 times argument reduction
514  s = _mm512_mul_ps(s, s);
515  // Evaluate Taylor series
516  s = _mm512_mul_ps(
517  _mm512_fmadd_ps(
518  _mm512_fmsub_ps(
519  _mm512_fmadd_ps(_mm512_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
520  s,
521  cp1),
522  s);
523 
524  for (i = 0; i < 3; i++) {
525  s = _mm512_mul_ps(s, _mm512_sub_ps(ffours, s));
526  }
527  s = _mm512_div_ps(s, ftwos);
528 
529  sine = _mm512_sqrt_ps(_mm512_mul_ps(_mm512_sub_ps(ftwos, s), s));
530  cosine = _mm512_sub_ps(fones, s);
531 
532  condition1 = _mm512_cmpneq_epi32_mask(
533  _mm512_and_si512(_mm512_add_epi32(q, ones), twos), zeros);
534  ltZero = _mm512_cmp_ps_mask(aVal, _mm512_setzero_ps(), _CMP_LT_OS);
535  condition2 = _mm512_kxor(
536  _mm512_cmpneq_epi32_mask(_mm512_and_epi32(q, fours), zeros), ltZero);
537 
538  sine = _mm512_mask_blend_ps(condition1, sine, cosine);
539  sine = _mm512_mask_mul_ps(sine, condition2, sine, _mm512_set1_ps(-1.f));
540  _mm512_storeu_ps(sinPtr, sine);
541  inPtr += 16;
542  sinPtr += 16;
543  }
544 
545  number = sixteenPoints * 16;
546  for (; number < num_points; number++) {
547  *sinPtr++ = sinf(*inPtr++);
548  }
549 }
550 #endif
551 
552 #if LV_HAVE_AVX2 && LV_HAVE_FMA
553 #include <immintrin.h>
554 
555 static inline void
556 volk_32f_sin_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
557 {
558  float* bPtr = bVector;
559  const float* aPtr = aVector;
560 
561  unsigned int number = 0;
562  unsigned int eighthPoints = num_points / 8;
563  unsigned int i = 0;
564 
565  __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
566  fzeroes;
567  __m256 sine, cosine, condition1, condition2;
568  __m256i q, r, ones, twos, fours;
569 
570  m4pi = _mm256_set1_ps(1.273239545);
571  pio4A = _mm256_set1_ps(0.78515625);
572  pio4B = _mm256_set1_ps(0.241876e-3);
573  ffours = _mm256_set1_ps(4.0);
574  ftwos = _mm256_set1_ps(2.0);
575  fones = _mm256_set1_ps(1.0);
576  fzeroes = _mm256_setzero_ps();
577  ones = _mm256_set1_epi32(1);
578  twos = _mm256_set1_epi32(2);
579  fours = _mm256_set1_epi32(4);
580 
581  cp1 = _mm256_set1_ps(1.0);
582  cp2 = _mm256_set1_ps(0.83333333e-1);
583  cp3 = _mm256_set1_ps(0.2777778e-2);
584  cp4 = _mm256_set1_ps(0.49603e-4);
585  cp5 = _mm256_set1_ps(0.551e-6);
586 
587  for (; number < eighthPoints; number++) {
588  aVal = _mm256_loadu_ps(aPtr);
589  s = _mm256_sub_ps(aVal,
590  _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
591  _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
592  q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
593  r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
594 
595  s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
596  s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);
597 
598  s = _mm256_div_ps(
599  s,
600  _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
601  s = _mm256_mul_ps(s, s);
602  // Evaluate Taylor series
603  s = _mm256_mul_ps(
604  _mm256_fmadd_ps(
605  _mm256_fmsub_ps(
606  _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
607  s,
608  cp1),
609  s);
610 
611  for (i = 0; i < 3; i++) {
612  s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
613  }
614  s = _mm256_div_ps(s, ftwos);
615 
616  sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
617  cosine = _mm256_sub_ps(fones, s);
618 
619  condition1 = _mm256_cmp_ps(
620  _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
621  fzeroes,
622  _CMP_NEQ_UQ);
623  condition2 = _mm256_cmp_ps(
624  _mm256_cmp_ps(
625  _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
626  _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
627  _CMP_NEQ_UQ);
628  // Need this condition only for cos
629  // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q,
630  // twos), fours)), fzeroes);
631 
632  sine =
633  _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
634  sine = _mm256_sub_ps(
635  sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
636  _mm256_storeu_ps(bPtr, sine);
637  aPtr += 8;
638  bPtr += 8;
639  }
640 
641  number = eighthPoints * 8;
642  for (; number < num_points; number++) {
643  *bPtr++ = sin(*aPtr++);
644  }
645 }
646 
647 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
648 
649 #ifdef LV_HAVE_AVX2
650 #include <immintrin.h>
651 
652 static inline void
653 volk_32f_sin_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points)
654 {
655  float* bPtr = bVector;
656  const float* aPtr = aVector;
657 
658  unsigned int number = 0;
659  unsigned int eighthPoints = num_points / 8;
660  unsigned int i = 0;
661 
662  __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
663  fzeroes;
664  __m256 sine, cosine, condition1, condition2;
665  __m256i q, r, ones, twos, fours;
666 
667  m4pi = _mm256_set1_ps(1.273239545);
668  pio4A = _mm256_set1_ps(0.78515625);
669  pio4B = _mm256_set1_ps(0.241876e-3);
670  ffours = _mm256_set1_ps(4.0);
671  ftwos = _mm256_set1_ps(2.0);
672  fones = _mm256_set1_ps(1.0);
673  fzeroes = _mm256_setzero_ps();
674  ones = _mm256_set1_epi32(1);
675  twos = _mm256_set1_epi32(2);
676  fours = _mm256_set1_epi32(4);
677 
678  cp1 = _mm256_set1_ps(1.0);
679  cp2 = _mm256_set1_ps(0.83333333e-1);
680  cp3 = _mm256_set1_ps(0.2777778e-2);
681  cp4 = _mm256_set1_ps(0.49603e-4);
682  cp5 = _mm256_set1_ps(0.551e-6);
683 
684  for (; number < eighthPoints; number++) {
685  aVal = _mm256_loadu_ps(aPtr);
686  s = _mm256_sub_ps(aVal,
687  _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
688  _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
689  q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
690  r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
691 
692  s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
693  s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));
694 
695  s = _mm256_div_ps(
696  s,
697  _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
698  s = _mm256_mul_ps(s, s);
699  // Evaluate Taylor series
700  s = _mm256_mul_ps(
701  _mm256_add_ps(
702  _mm256_mul_ps(
703  _mm256_sub_ps(
704  _mm256_mul_ps(
705  _mm256_add_ps(
706  _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
707  s),
708  cp3),
709  s),
710  cp2),
711  s),
712  cp1),
713  s);
714 
715  for (i = 0; i < 3; i++) {
716  s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
717  }
718  s = _mm256_div_ps(s, ftwos);
719 
720  sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
721  cosine = _mm256_sub_ps(fones, s);
722 
723  condition1 = _mm256_cmp_ps(
724  _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
725  fzeroes,
726  _CMP_NEQ_UQ);
727  condition2 = _mm256_cmp_ps(
728  _mm256_cmp_ps(
729  _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
730  _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
731  _CMP_NEQ_UQ);
732  // Need this condition only for cos
733  // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q,
734  // twos), fours)), fzeroes);
735 
736  sine =
737  _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
738  sine = _mm256_sub_ps(
739  sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
740  _mm256_storeu_ps(bPtr, sine);
741  aPtr += 8;
742  bPtr += 8;
743  }
744 
745  number = eighthPoints * 8;
746  for (; number < num_points; number++) {
747  *bPtr++ = sin(*aPtr++);
748  }
749 }
750 
751 #endif /* LV_HAVE_AVX2 for unaligned */
752 
753 
754 #ifdef LV_HAVE_SSE4_1
755 #include <smmintrin.h>
756 
757 static inline void
758 volk_32f_sin_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
759 {
760  float* bPtr = bVector;
761  const float* aPtr = aVector;
762 
763  unsigned int number = 0;
764  unsigned int quarterPoints = num_points / 4;
765  unsigned int i = 0;
766 
767  __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
768  fzeroes;
769  __m128 sine, cosine, condition1, condition2;
770  __m128i q, r, ones, twos, fours;
771 
772  m4pi = _mm_set1_ps(1.273239545);
773  pio4A = _mm_set1_ps(0.78515625);
774  pio4B = _mm_set1_ps(0.241876e-3);
775  ffours = _mm_set1_ps(4.0);
776  ftwos = _mm_set1_ps(2.0);
777  fones = _mm_set1_ps(1.0);
778  fzeroes = _mm_setzero_ps();
779  ones = _mm_set1_epi32(1);
780  twos = _mm_set1_epi32(2);
781  fours = _mm_set1_epi32(4);
782 
783  cp1 = _mm_set1_ps(1.0);
784  cp2 = _mm_set1_ps(0.83333333e-1);
785  cp3 = _mm_set1_ps(0.2777778e-2);
786  cp4 = _mm_set1_ps(0.49603e-4);
787  cp5 = _mm_set1_ps(0.551e-6);
788 
789  for (; number < quarterPoints; number++) {
790  aVal = _mm_loadu_ps(aPtr);
791  s = _mm_sub_ps(aVal,
792  _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
793  q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
794  r = _mm_add_epi32(q, _mm_and_si128(q, ones));
795 
796  s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
797  s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
798 
799  s = _mm_div_ps(
800  s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
801  s = _mm_mul_ps(s, s);
802  // Evaluate Taylor series
803  s = _mm_mul_ps(
804  _mm_add_ps(
805  _mm_mul_ps(
806  _mm_sub_ps(
807  _mm_mul_ps(
808  _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
809  cp3),
810  s),
811  cp2),
812  s),
813  cp1),
814  s);
815 
816  for (i = 0; i < 3; i++) {
817  s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
818  }
819  s = _mm_div_ps(s, ftwos);
820 
821  sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
822  cosine = _mm_sub_ps(fones, s);
823 
824  condition1 = _mm_cmpneq_ps(
825  _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
826  condition2 = _mm_cmpneq_ps(
827  _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes),
828  _mm_cmplt_ps(aVal, fzeroes));
829 
830  sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(cosine, sine), condition1));
831  sine =
832  _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
833  _mm_storeu_ps(bPtr, sine);
834  aPtr += 4;
835  bPtr += 4;
836  }
837 
838  number = quarterPoints * 4;
839  for (; number < num_points; number++) {
840  *bPtr++ = sinf(*aPtr++);
841  }
842 }
843 
844 #endif /* LV_HAVE_SSE4_1 for unaligned */
845 
846 
847 #ifdef LV_HAVE_GENERIC
848 
849 static inline void
850 volk_32f_sin_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
851 {
852  float* bPtr = bVector;
853  const float* aPtr = aVector;
854  unsigned int number = 0;
855 
856  for (number = 0; number < num_points; number++) {
857  *bPtr++ = sinf(*aPtr++);
858  }
859 }
860 
861 #endif /* LV_HAVE_GENERIC */
862 
863 
864 #ifdef LV_HAVE_NEON
865 #include <arm_neon.h>
867 
868 static inline void
869 volk_32f_sin_32f_neon(float* bVector, const float* aVector, unsigned int num_points)
870 {
871  unsigned int number = 0;
872  unsigned int quarter_points = num_points / 4;
873  float* bVectorPtr = bVector;
874  const float* aVectorPtr = aVector;
875 
876  float32x4_t b_vec;
877  float32x4_t a_vec;
878 
879  for (number = 0; number < quarter_points; number++) {
880  a_vec = vld1q_f32(aVectorPtr);
881  // Prefetch next one, speeds things up
882  __VOLK_PREFETCH(aVectorPtr + 4);
883  b_vec = _vsinq_f32(a_vec);
884  vst1q_f32(bVectorPtr, b_vec);
885  // move pointers ahead
886  bVectorPtr += 4;
887  aVectorPtr += 4;
888  }
889 
890  // Deal with the rest
891  for (number = quarter_points * 4; number < num_points; number++) {
892  *bVectorPtr++ = sinf(*aVectorPtr++);
893  }
894 }
895 
896 #endif /* LV_HAVE_NEON */
897 
898 #ifdef LV_HAVE_RVV
899 #include <riscv_vector.h>
900 
901 static inline void
902 volk_32f_sin_32f_rvv(float* bVector, const float* aVector, unsigned int num_points)
903 {
904  size_t vlmax = __riscv_vsetvlmax_e32m2();
905 
906  const vfloat32m2_t c4oPi = __riscv_vfmv_v_f_f32m2(1.2732395f, vlmax);
907  const vfloat32m2_t cPio4a = __riscv_vfmv_v_f_f32m2(0.7853982f, vlmax);
908  const vfloat32m2_t cPio4b = __riscv_vfmv_v_f_f32m2(7.946627e-09f, vlmax);
909  const vfloat32m2_t cPio4c = __riscv_vfmv_v_f_f32m2(3.061617e-17f, vlmax);
910 
911  const vfloat32m2_t cf1 = __riscv_vfmv_v_f_f32m2(1.0f, vlmax);
912  const vfloat32m2_t cf4 = __riscv_vfmv_v_f_f32m2(4.0f, vlmax);
913 
914  const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(0.0833333333f, vlmax);
915  const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(0.0027777778f, vlmax);
916  const vfloat32m2_t c4 = __riscv_vfmv_v_f_f32m2(4.9603175e-05, vlmax);
917  const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(5.5114638e-07, vlmax);
918 
919  size_t n = num_points;
920  for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl) {
921  vl = __riscv_vsetvl_e32m2(n);
922  vfloat32m2_t v = __riscv_vle32_v_f32m2(aVector, vl);
923  vfloat32m2_t s = __riscv_vfabs(v, vl);
924  vint32m2_t q = __riscv_vfcvt_x(__riscv_vfmul(s, c4oPi, vl), vl);
925  vfloat32m2_t r = __riscv_vfcvt_f(__riscv_vadd(q, __riscv_vand(q, 1, vl), vl), vl);
926 
927  s = __riscv_vfnmsac(s, cPio4a, r, vl);
928  s = __riscv_vfnmsac(s, cPio4b, r, vl);
929  s = __riscv_vfnmsac(s, cPio4c, r, vl);
930 
931  s = __riscv_vfmul(s, 1 / 8.0f, vl);
932  s = __riscv_vfmul(s, s, vl);
933  vfloat32m2_t t = s;
934  s = __riscv_vfmsub(s, c5, c4, vl);
935  s = __riscv_vfmadd(s, t, c3, vl);
936  s = __riscv_vfmsub(s, t, c2, vl);
937  s = __riscv_vfmadd(s, t, cf1, vl);
938  s = __riscv_vfmul(s, t, vl);
939  s = __riscv_vfmul(s, __riscv_vfsub(cf4, s, vl), vl);
940  s = __riscv_vfmul(s, __riscv_vfsub(cf4, s, vl), vl);
941  s = __riscv_vfmul(s, __riscv_vfsub(cf4, s, vl), vl);
942  s = __riscv_vfmul(s, 1 / 2.0f, vl);
943 
944  vfloat32m2_t sine =
945  __riscv_vfsqrt(__riscv_vfmul(__riscv_vfrsub(s, 2.0f, vl), s, vl), vl);
946  vfloat32m2_t cosine = __riscv_vfsub(cf1, s, vl);
947 
948  vbool16_t m1 = __riscv_vmsne(__riscv_vand(__riscv_vadd(q, 1, vl), 2, vl), 0, vl);
949  vbool16_t m2 = __riscv_vmxor(__riscv_vmslt(__riscv_vreinterpret_i32m2(v), 0, vl),
950  __riscv_vmsne(__riscv_vand(q, 4, vl), 0, vl),
951  vl);
952 
953  sine = __riscv_vmerge(sine, cosine, m1, vl);
954  sine = __riscv_vfneg_mu(m2, sine, sine, vl);
955 
956  __riscv_vse32(bVector, sine, vl);
957  }
958 }
959 #endif /*LV_HAVE_RVV*/
960 
961 #endif /* INCLUDED_volk_32f_sin_32f_u_H */
static void volk_32f_sin_32f_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_sin_32f.h:850
static void volk_32f_sin_32f_neon(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_sin_32f.h:869
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:68
for i
Definition: volk_config_fixed.tmpl.h:13
static float32x4_t _vsinq_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:249