Vector Optimized Library of Kernels (VOLK) 3.2.0
Architecture-tuned implementations of math kernels
volk_32f_cos_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
59 #include <inttypes.h>
60 #include <math.h>
61 #include <stdio.h>
62 
63 #ifndef INCLUDED_volk_32f_cos_32f_a_H
64 #define INCLUDED_volk_32f_cos_32f_a_H
65 
66 #ifdef LV_HAVE_AVX512F
67 
68 #include <immintrin.h>
/*
 * Computes cos(x) for each float in inVector using AVX-512F, processing
 * 16 points per iteration with aligned loads/stores; leftover points are
 * handled with scalar cosf().
 *
 * Algorithm (see the Shibata reference later in this file): |x| is reduced
 * modulo pi/4 using a three-constant split of pi/4, scaled down by 2^3 = 8,
 * a short Taylor polynomial is evaluated, the scaling is undone with three
 * double-angle steps, and the quadrant bits of q select/negate the result.
 */
static inline void volk_32f_cos_32f_a_avx512f(float* cosVector,
                                              const float* inVector,
                                              unsigned int num_points)
{
    float* cosPtr = cosVector;
    const float* inPtr = inVector;

    unsigned int number = 0;
    unsigned int sixteenPoints = num_points / 16; // full 16-wide iterations
    unsigned int i = 0;

    __m512 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
        fones, sine, cosine;
    __m512i q, zeros, ones, twos, fours;

    // 4/pi, and pi/4 split into three parts (A + B + C) so the reduction
    // r*pi/4 can be subtracted with limited cancellation error
    m4pi = _mm512_set1_ps(1.273239544735162542821171882678754627704620361328125);
    pio4A = _mm512_set1_ps(0.7853981554508209228515625);
    pio4B = _mm512_set1_ps(0.794662735614792836713604629039764404296875e-8);
    pio4C = _mm512_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
    ffours = _mm512_set1_ps(4.0);
    ftwos = _mm512_set1_ps(2.0);
    fones = _mm512_set1_ps(1.0);
    zeros = _mm512_setzero_epi32();
    ones = _mm512_set1_epi32(1);
    twos = _mm512_set1_epi32(2);
    fours = _mm512_set1_epi32(4);

    // Taylor coefficients: 1, 1/12, 1/360, 1/20160, 1/1814400
    // (same series as volk_32f_cos_32f_generic_fast below)
    cp1 = _mm512_set1_ps(1.0);
    cp2 = _mm512_set1_ps(0.08333333333333333);
    cp3 = _mm512_set1_ps(0.002777777777777778);
    cp4 = _mm512_set1_ps(4.96031746031746e-05);
    cp5 = _mm512_set1_ps(5.511463844797178e-07);
    __mmask16 condition1, condition2;

    for (; number < sixteenPoints; number++) {
        aVal = _mm512_load_ps(inPtr);
        // s = fabs(aVal): clear the sign bit (cosine is even)
        s = (__m512)(_mm512_and_si512((__m512i)(aVal), _mm512_set1_epi32(0x7fffffff)));

        // q = (int) (s * (4/pi)), floor(aVal / (pi/4)) -> quadrant index
        q = _mm512_cvtps_epi32(_mm512_floor_ps(_mm512_mul_ps(s, m4pi)));
        // r = q + q&1, q indicates quadrant, r gives the (even) multiple
        // of pi/4 to subtract
        r = _mm512_cvtepi32_ps(_mm512_add_epi32(q, _mm512_and_si512(q, ones)));

        // s -= r * pi/4, subtracted in three pieces for accuracy
        s = _mm512_fnmadd_ps(r, pio4A, s);
        s = _mm512_fnmadd_ps(r, pio4B, s);
        s = _mm512_fnmadd_ps(r, pio4C, s);

        s = _mm512_div_ps(
            s,
            _mm512_set1_ps(8.0f)); // The constant is 2^N, for 3 times argument reduction
        s = _mm512_mul_ps(s, s);
        // Evaluate Taylor series
        s = _mm512_mul_ps(
            _mm512_fmadd_ps(
                _mm512_fmsub_ps(
                    _mm512_fmadd_ps(_mm512_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
                s,
                cp1),
            s);

        // Undo the /8 scaling: three double-angle steps, s <- s*(4 - s)
        for (i = 0; i < 3; i++) {
            s = _mm512_mul_ps(s, _mm512_sub_ps(ffours, s));
        }
        s = _mm512_div_ps(s, ftwos);

        // Reconstruct sine and cosine of the reduced argument
        sine = _mm512_sqrt_ps(_mm512_mul_ps(_mm512_sub_ps(ftwos, s), s));
        cosine = _mm512_sub_ps(fones, s);

        // if(((q+1)&2) != 0) { cosine=sine;}
        condition1 = _mm512_cmpneq_epi32_mask(
            _mm512_and_si512(_mm512_add_epi32(q, ones), twos), zeros);

        // if(((q+2)&4) != 0) { cosine = -cosine;}
        condition2 = _mm512_cmpneq_epi32_mask(
            _mm512_and_si512(_mm512_add_epi32(q, twos), fours), zeros);
        cosine = _mm512_mask_blend_ps(condition1, cosine, sine);
        cosine = _mm512_mask_mul_ps(cosine, condition2, cosine, _mm512_set1_ps(-1.f));
        _mm512_store_ps(cosPtr, cosine);
        inPtr += 16;
        cosPtr += 16;
    }

    // Scalar tail for the remaining (num_points % 16) values
    number = sixteenPoints * 16;
    for (; number < num_points; number++) {
        *cosPtr++ = cosf(*inPtr++);
    }
}
157 #endif
158 
159 #if LV_HAVE_AVX2 && LV_HAVE_FMA
160 #include <immintrin.h>
161 
162 static inline void
163 volk_32f_cos_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
164 {
165  float* bPtr = bVector;
166  const float* aPtr = aVector;
167 
168  unsigned int number = 0;
169  unsigned int eighthPoints = num_points / 8;
170  unsigned int i = 0;
171 
172  __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
173  fones, fzeroes;
174  __m256 sine, cosine;
175  __m256i q, ones, twos, fours;
176 
177  m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
178  pio4A = _mm256_set1_ps(0.7853981554508209228515625);
179  pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
180  pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
181  ffours = _mm256_set1_ps(4.0);
182  ftwos = _mm256_set1_ps(2.0);
183  fones = _mm256_set1_ps(1.0);
184  fzeroes = _mm256_setzero_ps();
185  __m256i zeroes = _mm256_set1_epi32(0);
186  ones = _mm256_set1_epi32(1);
187  __m256i allones = _mm256_set1_epi32(0xffffffff);
188  twos = _mm256_set1_epi32(2);
189  fours = _mm256_set1_epi32(4);
190 
191  cp1 = _mm256_set1_ps(1.0);
192  cp2 = _mm256_set1_ps(0.08333333333333333);
193  cp3 = _mm256_set1_ps(0.002777777777777778);
194  cp4 = _mm256_set1_ps(4.96031746031746e-05);
195  cp5 = _mm256_set1_ps(5.511463844797178e-07);
196  union bit256 condition1;
197  union bit256 condition3;
198 
199  for (; number < eighthPoints; number++) {
200 
201  aVal = _mm256_load_ps(aPtr);
202  // s = fabs(aVal)
203  s = _mm256_sub_ps(aVal,
204  _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
205  _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
206  // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
207  q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
208  // r = q + q&1, q indicates quadrant, r gives
209  r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
210 
211  s = _mm256_fnmadd_ps(r, pio4A, s);
212  s = _mm256_fnmadd_ps(r, pio4B, s);
213  s = _mm256_fnmadd_ps(r, pio4C, s);
214 
215  s = _mm256_div_ps(
216  s,
217  _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
218  s = _mm256_mul_ps(s, s);
219  // Evaluate Taylor series
220  s = _mm256_mul_ps(
221  _mm256_fmadd_ps(
222  _mm256_fmsub_ps(
223  _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
224  s,
225  cp1),
226  s);
227 
228  for (i = 0; i < 3; i++) {
229  s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
230  }
231  s = _mm256_div_ps(s, ftwos);
232 
233  sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
234  cosine = _mm256_sub_ps(fones, s);
235 
236  // if(((q+1)&2) != 0) { cosine=sine;}
237  condition1.int_vec =
238  _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
239  condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
240 
241  // if(((q+2)&4) != 0) { cosine = -cosine;}
242  condition3.int_vec = _mm256_cmpeq_epi32(
243  _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
244  condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
245 
246  cosine = _mm256_add_ps(
247  cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
248  cosine = _mm256_sub_ps(cosine,
249  _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)),
250  condition3.float_vec));
251  _mm256_store_ps(bPtr, cosine);
252  aPtr += 8;
253  bPtr += 8;
254  }
255 
256  number = eighthPoints * 8;
257  for (; number < num_points; number++) {
258  *bPtr++ = cos(*aPtr++);
259  }
260 }
261 
262 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
263 
264 #ifdef LV_HAVE_AVX2
265 #include <immintrin.h>
266 
267 static inline void
268 volk_32f_cos_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points)
269 {
270  float* bPtr = bVector;
271  const float* aPtr = aVector;
272 
273  unsigned int number = 0;
274  unsigned int eighthPoints = num_points / 8;
275  unsigned int i = 0;
276 
277  __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
278  fones, fzeroes;
279  __m256 sine, cosine;
280  __m256i q, ones, twos, fours;
281 
282  m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
283  pio4A = _mm256_set1_ps(0.7853981554508209228515625);
284  pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
285  pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
286  ffours = _mm256_set1_ps(4.0);
287  ftwos = _mm256_set1_ps(2.0);
288  fones = _mm256_set1_ps(1.0);
289  fzeroes = _mm256_setzero_ps();
290  __m256i zeroes = _mm256_set1_epi32(0);
291  ones = _mm256_set1_epi32(1);
292  __m256i allones = _mm256_set1_epi32(0xffffffff);
293  twos = _mm256_set1_epi32(2);
294  fours = _mm256_set1_epi32(4);
295 
296  cp1 = _mm256_set1_ps(1.0);
297  cp2 = _mm256_set1_ps(0.08333333333333333);
298  cp3 = _mm256_set1_ps(0.002777777777777778);
299  cp4 = _mm256_set1_ps(4.96031746031746e-05);
300  cp5 = _mm256_set1_ps(5.511463844797178e-07);
301  union bit256 condition1;
302  union bit256 condition3;
303 
304  for (; number < eighthPoints; number++) {
305 
306  aVal = _mm256_load_ps(aPtr);
307  // s = fabs(aVal)
308  s = _mm256_sub_ps(aVal,
309  _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
310  _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
311  // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
312  q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
313  // r = q + q&1, q indicates quadrant, r gives
314  r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
315 
316  s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4A));
317  s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4B));
318  s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4C));
319 
320  s = _mm256_div_ps(
321  s,
322  _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
323  s = _mm256_mul_ps(s, s);
324  // Evaluate Taylor series
325  s = _mm256_mul_ps(
326  _mm256_add_ps(
327  _mm256_mul_ps(
328  _mm256_sub_ps(
329  _mm256_mul_ps(
330  _mm256_add_ps(
331  _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
332  s),
333  cp3),
334  s),
335  cp2),
336  s),
337  cp1),
338  s);
339 
340  for (i = 0; i < 3; i++) {
341  s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
342  }
343  s = _mm256_div_ps(s, ftwos);
344 
345  sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
346  cosine = _mm256_sub_ps(fones, s);
347 
348  // if(((q+1)&2) != 0) { cosine=sine;}
349  condition1.int_vec =
350  _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
351  condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
352 
353  // if(((q+2)&4) != 0) { cosine = -cosine;}
354  condition3.int_vec = _mm256_cmpeq_epi32(
355  _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
356  condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
357 
358  cosine = _mm256_add_ps(
359  cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
360  cosine = _mm256_sub_ps(cosine,
361  _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)),
362  condition3.float_vec));
363  _mm256_store_ps(bPtr, cosine);
364  aPtr += 8;
365  bPtr += 8;
366  }
367 
368  number = eighthPoints * 8;
369  for (; number < num_points; number++) {
370  *bPtr++ = cos(*aPtr++);
371  }
372 }
373 
374 #endif /* LV_HAVE_AVX2 for aligned */
375 
376 #ifdef LV_HAVE_SSE4_1
377 #include <smmintrin.h>
378 
/*
 * Computes cos(x) for each float in aVector using SSE4.1, processing
 * 4 points per iteration with aligned loads/stores; leftover points are
 * handled with scalar cosf().
 *
 * Same algorithm as the AVX kernels above: reduce |x| modulo pi/4 with a
 * three-constant split of pi/4, scale by 1/8, evaluate a short Taylor
 * polynomial, undo the scaling with three double-angle steps, then use the
 * quadrant bits of q to select/negate the result.
 */
static inline void
volk_32f_cos_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int quarterPoints = num_points / 4; // full 4-wide iterations
    unsigned int i = 0;

    __m128 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
        fones, fzeroes;
    __m128 sine, cosine;
    __m128i q, ones, twos, fours;

    // 4/pi and the three-part split of pi/4 used for argument reduction
    m4pi = _mm_set1_ps(1.273239544735162542821171882678754627704620361328125);
    pio4A = _mm_set1_ps(0.7853981554508209228515625);
    pio4B = _mm_set1_ps(0.794662735614792836713604629039764404296875e-8);
    pio4C = _mm_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
    ffours = _mm_set1_ps(4.0);
    ftwos = _mm_set1_ps(2.0);
    fones = _mm_set1_ps(1.0);
    fzeroes = _mm_setzero_ps();
    __m128i zeroes = _mm_set1_epi32(0);
    ones = _mm_set1_epi32(1);
    __m128i allones = _mm_set1_epi32(0xffffffff);
    twos = _mm_set1_epi32(2);
    fours = _mm_set1_epi32(4);

    // Taylor coefficients: 1, 1/12, 1/360, 1/20160, 1/1814400
    cp1 = _mm_set1_ps(1.0);
    cp2 = _mm_set1_ps(0.08333333333333333);
    cp3 = _mm_set1_ps(0.002777777777777778);
    cp4 = _mm_set1_ps(4.96031746031746e-05);
    cp5 = _mm_set1_ps(5.511463844797178e-07);
    // union bit128 (volk_common.h) reinterprets masks between int/float lanes
    union bit128 condition1;
    union bit128 condition3;

    for (; number < quarterPoints; number++) {

        aVal = _mm_load_ps(aPtr);
        // s = fabs(aVal): subtract 2x on the lanes where x < 0
        s = _mm_sub_ps(aVal,
                       _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
        // q = (int) (s * (4/pi)), floor(aVal / (pi/4)) -> quadrant index
        q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
        // r = q + q&1, even multiple of pi/4 to subtract
        r = _mm_cvtepi32_ps(_mm_add_epi32(q, _mm_and_si128(q, ones)));

        // s -= r * pi/4, subtracted in three pieces for accuracy
        s = _mm_sub_ps(s, _mm_mul_ps(r, pio4A));
        s = _mm_sub_ps(s, _mm_mul_ps(r, pio4B));
        s = _mm_sub_ps(s, _mm_mul_ps(r, pio4C));

        s = _mm_div_ps(
            s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
        s = _mm_mul_ps(s, s);
        // Evaluate Taylor series
        s = _mm_mul_ps(
            _mm_add_ps(
                _mm_mul_ps(
                    _mm_sub_ps(
                        _mm_mul_ps(
                            _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
                                       cp3),
                            s),
                        cp2),
                    s),
                cp1),
            s);

        // Undo the /8 scaling: three double-angle steps, s <- s*(4 - s)
        for (i = 0; i < 3; i++) {
            s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
        }
        s = _mm_div_ps(s, ftwos);

        // Reconstruct sine and cosine of the reduced argument
        sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
        cosine = _mm_sub_ps(fones, s);

        // if(((q+1)&2) != 0) { cosine=sine;}
        condition1.int_vec =
            _mm_cmpeq_epi32(_mm_and_si128(_mm_add_epi32(q, ones), twos), zeroes);
        condition1.int_vec = _mm_xor_si128(allones, condition1.int_vec);

        // if(((q+2)&4) != 0) { cosine = -cosine;}
        condition3.int_vec =
            _mm_cmpeq_epi32(_mm_and_si128(_mm_add_epi32(q, twos), fours), zeroes);
        condition3.int_vec = _mm_xor_si128(allones, condition3.int_vec);

        cosine = _mm_add_ps(cosine,
                            _mm_and_ps(_mm_sub_ps(sine, cosine), condition1.float_vec));
        // negate via cosine - 2*cosine on the masked lanes
        cosine = _mm_sub_ps(
            cosine,
            _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3.float_vec));
        _mm_store_ps(bPtr, cosine);
        aPtr += 4;
        bPtr += 4;
    }

    // Scalar tail for the remaining (num_points % 4) values
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *bPtr++ = cosf(*aPtr++);
    }
}
481 
482 #endif /* LV_HAVE_SSE4_1 for aligned */
483 
484 #endif /* INCLUDED_volk_32f_cos_32f_a_H */
485 
486 
487 #ifndef INCLUDED_volk_32f_cos_32f_u_H
488 #define INCLUDED_volk_32f_cos_32f_u_H
489 
490 #ifdef LV_HAVE_AVX512F
491 
492 #include <immintrin.h>
/*
 * Computes cos(x) for each float in inVector using AVX-512F, processing
 * 16 points per iteration with unaligned loads/stores; leftover points
 * are handled with scalar cosf().  Identical algorithm to the aligned
 * AVX-512 kernel earlier in this file.
 */
static inline void volk_32f_cos_32f_u_avx512f(float* cosVector,
                                              const float* inVector,
                                              unsigned int num_points)
{
    float* cosPtr = cosVector;
    const float* inPtr = inVector;

    unsigned int number = 0;
    unsigned int sixteenPoints = num_points / 16; // full 16-wide iterations
    unsigned int i = 0;

    __m512 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
        fones, sine, cosine;
    __m512i q, zeros, ones, twos, fours;

    // 4/pi and the three-part split of pi/4 used for argument reduction
    m4pi = _mm512_set1_ps(1.273239544735162542821171882678754627704620361328125);
    pio4A = _mm512_set1_ps(0.7853981554508209228515625);
    pio4B = _mm512_set1_ps(0.794662735614792836713604629039764404296875e-8);
    pio4C = _mm512_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
    ffours = _mm512_set1_ps(4.0);
    ftwos = _mm512_set1_ps(2.0);
    fones = _mm512_set1_ps(1.0);
    zeros = _mm512_setzero_epi32();
    ones = _mm512_set1_epi32(1);
    twos = _mm512_set1_epi32(2);
    fours = _mm512_set1_epi32(4);

    // Taylor coefficients: 1, 1/12, 1/360, 1/20160, 1/1814400
    cp1 = _mm512_set1_ps(1.0);
    cp2 = _mm512_set1_ps(0.08333333333333333);
    cp3 = _mm512_set1_ps(0.002777777777777778);
    cp4 = _mm512_set1_ps(4.96031746031746e-05);
    cp5 = _mm512_set1_ps(5.511463844797178e-07);
    __mmask16 condition1, condition2;
    for (; number < sixteenPoints; number++) {
        aVal = _mm512_loadu_ps(inPtr);
        // s = fabs(aVal): clear the sign bit (cosine is even)
        s = (__m512)(_mm512_and_si512((__m512i)(aVal), _mm512_set1_epi32(0x7fffffff)));

        // q = (int) (s * (4/pi)), floor(aVal / (pi/4)) -> quadrant index
        q = _mm512_cvtps_epi32(_mm512_floor_ps(_mm512_mul_ps(s, m4pi)));
        // r = q + q&1, even multiple of pi/4 to subtract
        r = _mm512_cvtepi32_ps(_mm512_add_epi32(q, _mm512_and_si512(q, ones)));

        // s -= r * pi/4, subtracted in three pieces for accuracy
        s = _mm512_fnmadd_ps(r, pio4A, s);
        s = _mm512_fnmadd_ps(r, pio4B, s);
        s = _mm512_fnmadd_ps(r, pio4C, s);

        s = _mm512_div_ps(
            s,
            _mm512_set1_ps(8.0f)); // The constant is 2^N, for 3 times argument reduction
        s = _mm512_mul_ps(s, s);
        // Evaluate Taylor series
        s = _mm512_mul_ps(
            _mm512_fmadd_ps(
                _mm512_fmsub_ps(
                    _mm512_fmadd_ps(_mm512_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
                s,
                cp1),
            s);

        // Undo the /8 scaling: three double-angle steps, s <- s*(4 - s)
        for (i = 0; i < 3; i++) {
            s = _mm512_mul_ps(s, _mm512_sub_ps(ffours, s));
        }
        s = _mm512_div_ps(s, ftwos);

        // Reconstruct sine and cosine of the reduced argument
        sine = _mm512_sqrt_ps(_mm512_mul_ps(_mm512_sub_ps(ftwos, s), s));
        cosine = _mm512_sub_ps(fones, s);

        // if(((q+1)&2) != 0) { cosine=sine;}
        condition1 = _mm512_cmpneq_epi32_mask(
            _mm512_and_si512(_mm512_add_epi32(q, ones), twos), zeros);

        // if(((q+2)&4) != 0) { cosine = -cosine;}
        condition2 = _mm512_cmpneq_epi32_mask(
            _mm512_and_si512(_mm512_add_epi32(q, twos), fours), zeros);

        cosine = _mm512_mask_blend_ps(condition1, cosine, sine);
        cosine = _mm512_mask_mul_ps(cosine, condition2, cosine, _mm512_set1_ps(-1.f));
        _mm512_storeu_ps(cosPtr, cosine);
        inPtr += 16;
        cosPtr += 16;
    }

    // Scalar tail for the remaining (num_points % 16) values
    number = sixteenPoints * 16;
    for (; number < num_points; number++) {
        *cosPtr++ = cosf(*inPtr++);
    }
}
581 #endif
582 
583 #if LV_HAVE_AVX2 && LV_HAVE_FMA
584 #include <immintrin.h>
585 
586 static inline void
587 volk_32f_cos_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
588 {
589  float* bPtr = bVector;
590  const float* aPtr = aVector;
591 
592  unsigned int number = 0;
593  unsigned int eighthPoints = num_points / 8;
594  unsigned int i = 0;
595 
596  __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
597  fones, fzeroes;
598  __m256 sine, cosine;
599  __m256i q, ones, twos, fours;
600 
601  m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
602  pio4A = _mm256_set1_ps(0.7853981554508209228515625);
603  pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
604  pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
605  ffours = _mm256_set1_ps(4.0);
606  ftwos = _mm256_set1_ps(2.0);
607  fones = _mm256_set1_ps(1.0);
608  fzeroes = _mm256_setzero_ps();
609  __m256i zeroes = _mm256_set1_epi32(0);
610  ones = _mm256_set1_epi32(1);
611  __m256i allones = _mm256_set1_epi32(0xffffffff);
612  twos = _mm256_set1_epi32(2);
613  fours = _mm256_set1_epi32(4);
614 
615  cp1 = _mm256_set1_ps(1.0);
616  cp2 = _mm256_set1_ps(0.08333333333333333);
617  cp3 = _mm256_set1_ps(0.002777777777777778);
618  cp4 = _mm256_set1_ps(4.96031746031746e-05);
619  cp5 = _mm256_set1_ps(5.511463844797178e-07);
620  union bit256 condition1;
621  union bit256 condition3;
622 
623  for (; number < eighthPoints; number++) {
624 
625  aVal = _mm256_loadu_ps(aPtr);
626  // s = fabs(aVal)
627  s = _mm256_sub_ps(aVal,
628  _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
629  _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
630  // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
631  q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
632  // r = q + q&1, q indicates quadrant, r gives
633  r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
634 
635  s = _mm256_fnmadd_ps(r, pio4A, s);
636  s = _mm256_fnmadd_ps(r, pio4B, s);
637  s = _mm256_fnmadd_ps(r, pio4C, s);
638 
639  s = _mm256_div_ps(
640  s,
641  _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
642  s = _mm256_mul_ps(s, s);
643  // Evaluate Taylor series
644  s = _mm256_mul_ps(
645  _mm256_fmadd_ps(
646  _mm256_fmsub_ps(
647  _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
648  s,
649  cp1),
650  s);
651 
652  for (i = 0; i < 3; i++) {
653  s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
654  }
655  s = _mm256_div_ps(s, ftwos);
656 
657  sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
658  cosine = _mm256_sub_ps(fones, s);
659 
660  // if(((q+1)&2) != 0) { cosine=sine;}
661  condition1.int_vec =
662  _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
663  condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
664 
665  // if(((q+2)&4) != 0) { cosine = -cosine;}
666  condition3.int_vec = _mm256_cmpeq_epi32(
667  _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
668  condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
669 
670  cosine = _mm256_add_ps(
671  cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
672  cosine = _mm256_sub_ps(cosine,
673  _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)),
674  condition3.float_vec));
675  _mm256_storeu_ps(bPtr, cosine);
676  aPtr += 8;
677  bPtr += 8;
678  }
679 
680  number = eighthPoints * 8;
681  for (; number < num_points; number++) {
682  *bPtr++ = cos(*aPtr++);
683  }
684 }
685 
686 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
687 
688 #ifdef LV_HAVE_AVX2
689 #include <immintrin.h>
690 
691 static inline void
692 volk_32f_cos_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points)
693 {
694  float* bPtr = bVector;
695  const float* aPtr = aVector;
696 
697  unsigned int number = 0;
698  unsigned int eighthPoints = num_points / 8;
699  unsigned int i = 0;
700 
701  __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
702  fones, fzeroes;
703  __m256 sine, cosine;
704  __m256i q, ones, twos, fours;
705 
706  m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
707  pio4A = _mm256_set1_ps(0.7853981554508209228515625);
708  pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
709  pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
710  ffours = _mm256_set1_ps(4.0);
711  ftwos = _mm256_set1_ps(2.0);
712  fones = _mm256_set1_ps(1.0);
713  fzeroes = _mm256_setzero_ps();
714  __m256i zeroes = _mm256_set1_epi32(0);
715  ones = _mm256_set1_epi32(1);
716  __m256i allones = _mm256_set1_epi32(0xffffffff);
717  twos = _mm256_set1_epi32(2);
718  fours = _mm256_set1_epi32(4);
719 
720  cp1 = _mm256_set1_ps(1.0);
721  cp2 = _mm256_set1_ps(0.08333333333333333);
722  cp3 = _mm256_set1_ps(0.002777777777777778);
723  cp4 = _mm256_set1_ps(4.96031746031746e-05);
724  cp5 = _mm256_set1_ps(5.511463844797178e-07);
725  union bit256 condition1;
726  union bit256 condition3;
727 
728  for (; number < eighthPoints; number++) {
729 
730  aVal = _mm256_loadu_ps(aPtr);
731  // s = fabs(aVal)
732  s = _mm256_sub_ps(aVal,
733  _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
734  _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
735  // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
736  q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
737  // r = q + q&1, q indicates quadrant, r gives
738  r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
739 
740  s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4A));
741  s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4B));
742  s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4C));
743 
744  s = _mm256_div_ps(
745  s,
746  _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
747  s = _mm256_mul_ps(s, s);
748  // Evaluate Taylor series
749  s = _mm256_mul_ps(
750  _mm256_add_ps(
751  _mm256_mul_ps(
752  _mm256_sub_ps(
753  _mm256_mul_ps(
754  _mm256_add_ps(
755  _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
756  s),
757  cp3),
758  s),
759  cp2),
760  s),
761  cp1),
762  s);
763 
764  for (i = 0; i < 3; i++) {
765  s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
766  }
767  s = _mm256_div_ps(s, ftwos);
768 
769  sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
770  cosine = _mm256_sub_ps(fones, s);
771 
772  // if(((q+1)&2) != 0) { cosine=sine;}
773  condition1.int_vec =
774  _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
775  condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
776 
777  // if(((q+2)&4) != 0) { cosine = -cosine;}
778  condition3.int_vec = _mm256_cmpeq_epi32(
779  _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
780  condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
781 
782  cosine = _mm256_add_ps(
783  cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
784  cosine = _mm256_sub_ps(cosine,
785  _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)),
786  condition3.float_vec));
787  _mm256_storeu_ps(bPtr, cosine);
788  aPtr += 8;
789  bPtr += 8;
790  }
791 
792  number = eighthPoints * 8;
793  for (; number < num_points; number++) {
794  *bPtr++ = cos(*aPtr++);
795  }
796 }
797 
798 #endif /* LV_HAVE_AVX2 for unaligned */
799 
800 #ifdef LV_HAVE_SSE4_1
801 #include <smmintrin.h>
802 
/*
 * Computes cos(x) for each float in aVector using SSE4.1, processing
 * 4 points per iteration with unaligned loads/stores; leftover points are
 * handled with scalar cosf().
 *
 * NOTE(review): unlike the aligned SSE4.1 kernel, this variant reduces the
 * argument with only a TWO-part split of pi/4 (pio4A + pio4B) and uses
 * shorter, lower-precision constants — presumably trading accuracy for
 * speed; confirm this difference is intentional before unifying.
 */
static inline void
volk_32f_cos_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int quarterPoints = num_points / 4; // full 4-wide iterations
    unsigned int i = 0;

    __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
        fzeroes;
    __m128 sine, cosine, condition1, condition3;
    __m128i q, r, ones, twos, fours;

    // 4/pi and a two-part split of pi/4 (lower precision than the other kernels)
    m4pi = _mm_set1_ps(1.273239545);
    pio4A = _mm_set1_ps(0.78515625);
    pio4B = _mm_set1_ps(0.241876e-3);
    ffours = _mm_set1_ps(4.0);
    ftwos = _mm_set1_ps(2.0);
    fones = _mm_set1_ps(1.0);
    fzeroes = _mm_setzero_ps();
    ones = _mm_set1_epi32(1);
    twos = _mm_set1_epi32(2);
    fours = _mm_set1_epi32(4);

    // Taylor coefficients: 1, 1/12, 1/360, 1/20160, 1/1814400 (truncated)
    cp1 = _mm_set1_ps(1.0);
    cp2 = _mm_set1_ps(0.83333333e-1);
    cp3 = _mm_set1_ps(0.2777778e-2);
    cp4 = _mm_set1_ps(0.49603e-4);
    cp5 = _mm_set1_ps(0.551e-6);

    for (; number < quarterPoints; number++) {
        aVal = _mm_loadu_ps(aPtr);
        // s = fabs(aVal): subtract 2x on the lanes where x < 0
        s = _mm_sub_ps(aVal,
                       _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
        // q = floor(|x| / (pi/4)) -> quadrant index
        q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
        // r = q + q&1: even multiple of pi/4 to subtract
        r = _mm_add_epi32(q, _mm_and_si128(q, ones));

        // s -= r * pi/4, subtracted in two pieces
        s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
        s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));

        s = _mm_div_ps(
            s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
        s = _mm_mul_ps(s, s);
        // Evaluate Taylor series
        s = _mm_mul_ps(
            _mm_add_ps(
                _mm_mul_ps(
                    _mm_sub_ps(
                        _mm_mul_ps(
                            _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
                                       cp3),
                            s),
                        cp2),
                    s),
                cp1),
            s);

        // Undo the /8 scaling: three double-angle steps, s <- s*(4 - s)
        for (i = 0; i < 3; i++) {
            s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
        }
        s = _mm_div_ps(s, ftwos);

        // Reconstruct sine and cosine of the reduced argument
        sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
        cosine = _mm_sub_ps(fones, s);

        // if(((q+1)&2) != 0) { cosine = sine; }
        condition1 = _mm_cmpneq_ps(
            _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);

        // if(((q+2)&4) != 0) { cosine = -cosine; }
        condition3 = _mm_cmpneq_ps(
            _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);

        cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1));
        // negate via cosine - 2*cosine on the masked lanes
        cosine = _mm_sub_ps(
            cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3));
        _mm_storeu_ps(bPtr, cosine);
        aPtr += 4;
        bPtr += 4;
    }

    // Scalar tail for the remaining (num_points % 4) values
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *bPtr++ = cosf(*aPtr++);
    }
}
889 
890 #endif /* LV_HAVE_SSE4_1 for unaligned */
891 
892 
893 #ifdef LV_HAVE_GENERIC
894 
895 /*
896  * For derivation see
897  * Shibata, Naoki, "Efficient evaluation methods of elementary functions
898  * suitable for SIMD computation," in Springer-Verlag 2010
899  */
900 static inline void volk_32f_cos_32f_generic_fast(float* bVector,
901  const float* aVector,
902  unsigned int num_points)
903 {
904  float* bPtr = bVector;
905  const float* aPtr = aVector;
906 
907  float m4pi = 1.273239544735162542821171882678754627704620361328125;
908  float pio4A = 0.7853981554508209228515625;
909  float pio4B = 0.794662735614792836713604629039764404296875e-8;
910  float pio4C = 0.306161699786838294306516483068750264552437361480769e-16;
911  int N = 3; // order of argument reduction
912 
913  unsigned int number;
914  for (number = 0; number < num_points; number++) {
915  float s = fabs(*aPtr);
916  int q = (int)(s * m4pi);
917  int r = q + (q & 1);
918  s -= r * pio4A;
919  s -= r * pio4B;
920  s -= r * pio4C;
921 
922  s = s * 0.125; // 2^-N (<--3)
923  s = s * s;
924  s = ((((s / 1814400. - 1.0 / 20160.0) * s + 1.0 / 360.0) * s - 1.0 / 12.0) * s +
925  1.0) *
926  s;
927 
928  int i;
929  for (i = 0; i < N; ++i) {
930  s = (4.0 - s) * s;
931  }
932  s = s / 2.0;
933 
934  float sine = sqrt((2.0 - s) * s);
935  float cosine = 1 - s;
936 
937  if (((q + 1) & 2) != 0) {
938  s = cosine;
939  cosine = sine;
940  sine = s;
941  }
942  if (((q + 2) & 4) != 0) {
943  cosine = -cosine;
944  }
945  *bPtr = cosine;
946  bPtr++;
947  aPtr++;
948  }
949 }
950 
951 #endif /* LV_HAVE_GENERIC */
952 
953 
954 #ifdef LV_HAVE_GENERIC
955 
956 static inline void
957 volk_32f_cos_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
958 {
959  float* bPtr = bVector;
960  const float* aPtr = aVector;
961  unsigned int number = 0;
962 
963  for (; number < num_points; number++) {
964  *bPtr++ = cosf(*aPtr++);
965  }
966 }
967 
968 #endif /* LV_HAVE_GENERIC */
969 
970 
971 #ifdef LV_HAVE_NEON
972 #include <arm_neon.h>
974 
/*
 * Computes cos(x) for each float in aVector using NEON, 4 points per
 * iteration.  The per-vector work is delegated to _vcosq_f32() (defined
 * in volk_neon_intrinsics.h); leftover points use scalar cosf().
 */
static inline void
volk_32f_cos_32f_neon(float* bVector, const float* aVector, unsigned int num_points)
{
    unsigned int number = 0;
    unsigned int quarter_points = num_points / 4; // full 4-wide iterations
    float* bVectorPtr = bVector;
    const float* aVectorPtr = aVector;

    float32x4_t b_vec;
    float32x4_t a_vec;

    for (number = 0; number < quarter_points; number++) {
        a_vec = vld1q_f32(aVectorPtr);
        // Prefetch next one, speeds things up
        __VOLK_PREFETCH(aVectorPtr + 4);
        b_vec = _vcosq_f32(a_vec);
        vst1q_f32(bVectorPtr, b_vec);
        // move pointers ahead
        bVectorPtr += 4;
        aVectorPtr += 4;
    }

    // Deal with the rest
    for (number = quarter_points * 4; number < num_points; number++) {
        *bVectorPtr++ = cosf(*aVectorPtr++);
    }
}
1002 
1003 #endif /* LV_HAVE_NEON */
1004 
1005 #ifdef LV_HAVE_RVV
1006 #include <riscv_vector.h>
1007 
/*
 * Computes cos(x) for each float in aVector using RISC-V Vector (RVV)
 * intrinsics with LMUL=2; the strip-mining loop lets vsetvl handle the
 * tail, so no scalar remainder loop is needed.
 *
 * Same scheme as the x86 kernels: reduce |x| modulo pi/4 with a
 * three-constant split of pi/4, scale by 1/8, evaluate a short Taylor
 * polynomial, undo the scaling with three double-angle steps, then use
 * the quadrant bits of q to select/negate the result.
 */
static inline void
volk_32f_cos_32f_rvv(float* bVector, const float* aVector, unsigned int num_points)
{
    size_t vlmax = __riscv_vsetvlmax_e32m2();

    // 4/pi and the three-part split of pi/4 used for argument reduction
    const vfloat32m2_t c4oPi = __riscv_vfmv_v_f_f32m2(1.2732395f, vlmax);
    const vfloat32m2_t cPio4a = __riscv_vfmv_v_f_f32m2(0.7853982f, vlmax);
    const vfloat32m2_t cPio4b = __riscv_vfmv_v_f_f32m2(7.946627e-09f, vlmax);
    const vfloat32m2_t cPio4c = __riscv_vfmv_v_f_f32m2(3.061617e-17f, vlmax);

    const vfloat32m2_t cf1 = __riscv_vfmv_v_f_f32m2(1.0f, vlmax);
    const vfloat32m2_t cf4 = __riscv_vfmv_v_f_f32m2(4.0f, vlmax);

    // Taylor coefficients: 1/12, 1/360, 1/20160, 1/1814400
    const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(0.0833333333f, vlmax);
    const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(0.0027777778f, vlmax);
    const vfloat32m2_t c4 = __riscv_vfmv_v_f_f32m2(4.9603175e-05f, vlmax);
    const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(5.5114638e-07f, vlmax);

    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl) {
        vl = __riscv_vsetvl_e32m2(n); // vl = min(n, vlmax); strip-mined tail
        vfloat32m2_t v = __riscv_vle32_v_f32m2(aVector, vl);
        vfloat32m2_t s = __riscv_vfabs(v, vl); // cosine is even, work with |x|
        // q = quadrant index.  NOTE(review): vfcvt_x converts with the
        // current rounding mode (nearest) rather than floor() as in the
        // x86 kernels — confirm this difference is intended.
        vint32m2_t q = __riscv_vfcvt_x(__riscv_vfmul(s, c4oPi, vl), vl);
        // r = q + (q & 1): even multiple of pi/4 to subtract
        vfloat32m2_t r = __riscv_vfcvt_f(__riscv_vadd(q, __riscv_vand(q, 1, vl), vl), vl);

        // s -= r * pi/4, subtracted in three pieces for accuracy
        s = __riscv_vfnmsac(s, cPio4a, r, vl);
        s = __riscv_vfnmsac(s, cPio4b, r, vl);
        s = __riscv_vfnmsac(s, cPio4c, r, vl);

        s = __riscv_vfmul(s, 1 / 8.0f, vl); // 2^-3 argument reduction
        s = __riscv_vfmul(s, s, vl);
        // Taylor series in s (Horner form via fused multiply ops)
        vfloat32m2_t t = s;
        s = __riscv_vfmsub(s, c5, c4, vl);
        s = __riscv_vfmadd(s, t, c3, vl);
        s = __riscv_vfmsub(s, t, c2, vl);
        s = __riscv_vfmadd(s, t, cf1, vl);
        s = __riscv_vfmul(s, t, vl);
        // Undo the /8 scaling: three double-angle steps, s <- s*(4 - s)
        s = __riscv_vfmul(s, __riscv_vfsub(cf4, s, vl), vl);
        s = __riscv_vfmul(s, __riscv_vfsub(cf4, s, vl), vl);
        s = __riscv_vfmul(s, __riscv_vfsub(cf4, s, vl), vl);
        s = __riscv_vfmul(s, 1 / 2.0f, vl);

        // Reconstruct sine and cosine of the reduced argument
        vfloat32m2_t sine =
            __riscv_vfsqrt(__riscv_vfmul(__riscv_vfrsub(s, 2.0f, vl), s, vl), vl);
        vfloat32m2_t cosine = __riscv_vfsub(cf1, s, vl);

        // m1: ((q+1)&2) != 0 -> take sine;  m2: ((q+2)&4) != 0 -> negate
        vbool16_t m1 = __riscv_vmsne(__riscv_vand(__riscv_vadd(q, 1, vl), 2, vl), 0, vl);
        vbool16_t m2 = __riscv_vmsne(__riscv_vand(__riscv_vadd(q, 2, vl), 4, vl), 0, vl);

        cosine = __riscv_vmerge(cosine, sine, m1, vl);
        cosine = __riscv_vfneg_mu(m2, cosine, cosine, vl);

        __riscv_vse32(bVector, cosine, vl);
    }
}
1064 #endif /*LV_HAVE_RVV*/
1065 
1066 #endif /* INCLUDED_volk_32f_cos_32f_u_H */
Definition: volk_common.h:116
Definition: volk_common.h:133
static void volk_32f_cos_32f_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_cos_32f.h:957
static void volk_32f_cos_32f_neon(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_cos_32f.h:976
static void volk_32f_cos_32f_generic_fast(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_cos_32f.h:900
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:68
for i
Definition: volk_config_fixed.tmpl.h:13
static float32x4_t _vcosq_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:255