/*
 * Vector Optimized Library of Kernels (VOLK) 3.2.0
 * Architecture-tuned implementations of math kernels.
 * volk_16i_32fc_dot_prod_32fc.h
 */
/* -*- c++ -*- */
/*
 * Copyright 2012, 2014 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */
9 
45 #ifndef INCLUDED_volk_16i_32fc_dot_prod_32fc_H
46 #define INCLUDED_volk_16i_32fc_dot_prod_32fc_H
47 
48 #include <stdio.h>
49 #include <volk/volk_common.h>
50 
51 
52 #ifdef LV_HAVE_GENERIC
53 
55  const short* input,
56  const lv_32fc_t* taps,
57  unsigned int num_points)
58 {
59 
60  static const int N_UNROLL = 4;
61 
62  lv_32fc_t acc0 = 0;
63  lv_32fc_t acc1 = 0;
64  lv_32fc_t acc2 = 0;
65  lv_32fc_t acc3 = 0;
66 
67  unsigned i = 0;
68  unsigned n = (num_points / N_UNROLL) * N_UNROLL;
69 
70  for (i = 0; i < n; i += N_UNROLL) {
71  acc0 += taps[i + 0] * (float)input[i + 0];
72  acc1 += taps[i + 1] * (float)input[i + 1];
73  acc2 += taps[i + 2] * (float)input[i + 2];
74  acc3 += taps[i + 3] * (float)input[i + 3];
75  }
76 
77  for (; i < num_points; i++) {
78  acc0 += taps[i] * (float)input[i];
79  }
80 
81  *result = acc0 + acc1 + acc2 + acc3;
82 }
83 
84 #endif /*LV_HAVE_GENERIC*/
85 
86 #ifdef LV_HAVE_NEON
87 #include <arm_neon.h>
/*!
 * \brief NEON implementation: dot product of 16-bit integer samples with
 * complex float taps, 4 samples per iteration.
 */
static inline void volk_16i_32fc_dot_prod_32fc_neon(lv_32fc_t* result,
                                                    const short* input,
                                                    const lv_32fc_t* taps,
                                                    unsigned int num_points)
{

    unsigned ii;
    unsigned quarter_points = num_points / 4;
    // Casts drop const for the intrinsic APIs; the data is only read.
    lv_32fc_t* tapsPtr = (lv_32fc_t*)taps;
    short* inputPtr = (short*)input;
    lv_32fc_t accumulator_vec[4];

    float32x4x2_t tapsVal, accumulator_val;
    int16x4_t input16;
    int32x4_t input32;
    float32x4_t input_float, prod_re, prod_im;

    // val[0] accumulates real parts, val[1] imaginary parts.
    accumulator_val.val[0] = vdupq_n_f32(0.0);
    accumulator_val.val[1] = vdupq_n_f32(0.0);

    for (ii = 0; ii < quarter_points; ++ii) {
        // vld2q de-interleaves the taps: val[0] = 4 real parts, val[1] = 4 imag parts.
        tapsVal = vld2q_f32((float*)tapsPtr);
        input16 = vld1_s16(inputPtr);
        // widen 16-bit int to 32-bit int
        input32 = vmovl_s16(input16);
        // convert 32-bit int to float (no scaling applied)
        input_float = vcvtq_f32_s32(input32);

        // Each real sample multiplies both the real and imaginary tap lanes.
        prod_re = vmulq_f32(input_float, tapsVal.val[0]);
        prod_im = vmulq_f32(input_float, tapsVal.val[1]);

        accumulator_val.val[0] = vaddq_f32(prod_re, accumulator_val.val[0]);
        accumulator_val.val[1] = vaddq_f32(prod_im, accumulator_val.val[1]);

        tapsPtr += 4;
        inputPtr += 4;
    }
    // vst2q re-interleaves (re,im), yielding 4 partial complex sums; reduce them.
    vst2q_f32((float*)accumulator_vec, accumulator_val);
    accumulator_vec[0] += accumulator_vec[1];
    accumulator_vec[2] += accumulator_vec[3];
    accumulator_vec[0] += accumulator_vec[2];

    // Scalar tail for the remaining (num_points % 4) samples.
    for (ii = quarter_points * 4; ii < num_points; ++ii) {
        accumulator_vec[0] += *(tapsPtr++) * (float)(*(inputPtr++));
    }

    *result = accumulator_vec[0];
}
136 
137 #endif /*LV_HAVE_NEON*/
138 
139 #if LV_HAVE_SSE && LV_HAVE_MMX
140 
/*!
 * \brief SSE/MMX implementation (unaligned loads): dot product of 16-bit
 * integer samples with complex float taps, 8 samples per iteration.
 */
static inline void volk_16i_32fc_dot_prod_32fc_u_sse(lv_32fc_t* result,
                                                     const short* input,
                                                     const lv_32fc_t* taps,
                                                     unsigned int num_points)
{

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
    const short* aPtr = input;
    // Taps viewed as a flat array of interleaved (re, im) floats.
    const float* bPtr = (float*)taps;

    __m64 m0, m1;
    __m128 f0, f1, f2, f3;
    __m128 a0Val, a1Val, a2Val, a3Val;
    __m128 b0Val, b1Val, b2Val, b3Val;
    __m128 c0Val, c1Val, c2Val, c3Val;

    // Four independent accumulators; reduced after the loop.
    __m128 dotProdVal0 = _mm_setzero_ps();
    __m128 dotProdVal1 = _mm_setzero_ps();
    __m128 dotProdVal2 = _mm_setzero_ps();
    __m128 dotProdVal3 = _mm_setzero_ps();

    for (; number < eighthPoints; number++) {

        // Pack 8 int16 samples into two MMX registers and convert to float.
        m0 = _mm_set_pi16(*(aPtr + 3), *(aPtr + 2), *(aPtr + 1), *(aPtr + 0));
        m1 = _mm_set_pi16(*(aPtr + 7), *(aPtr + 6), *(aPtr + 5), *(aPtr + 4));
        f0 = _mm_cvtpi16_ps(m0);
        f1 = _mm_cvtpi16_ps(m0);
        f2 = _mm_cvtpi16_ps(m1);
        f3 = _mm_cvtpi16_ps(m1);

        // f0 == f1 (and f2 == f3), so unpack duplicates each sample:
        // [s0,s0,s1,s1] etc., matching the interleaved (re,im) tap layout.
        a0Val = _mm_unpacklo_ps(f0, f1);
        a1Val = _mm_unpackhi_ps(f0, f1);
        a2Val = _mm_unpacklo_ps(f2, f3);
        a3Val = _mm_unpackhi_ps(f2, f3);

        b0Val = _mm_loadu_ps(bPtr);
        b1Val = _mm_loadu_ps(bPtr + 4);
        b2Val = _mm_loadu_ps(bPtr + 8);
        b3Val = _mm_loadu_ps(bPtr + 12);

        c0Val = _mm_mul_ps(a0Val, b0Val);
        c1Val = _mm_mul_ps(a1Val, b1Val);
        c2Val = _mm_mul_ps(a2Val, b2Val);
        c3Val = _mm_mul_ps(a3Val, b3Val);

        dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
        dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
        dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
        dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);

        aPtr += 8;   // 8 real samples consumed
        bPtr += 16;  // 8 complex taps = 16 floats consumed
    }

    _mm_empty(); // clear the mmx technology state

    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);

    __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];

    _mm_store_ps(dotProductVector,
                 dotProdVal0); // Store the results back into the dot product vector

    // Lanes alternate (re, im); combine the two complex partial sums.
    returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]);
    returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]);

    // Scalar tail for the remaining (num_points % 8) samples.
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[0] * bPtr[1]);
        aPtr += 1;
        bPtr += 2;
    }

    *result = returnValue;
}
221 
222 #endif /*LV_HAVE_SSE && LV_HAVE_MMX*/
223 
224 
225 #if LV_HAVE_AVX2 && LV_HAVE_FMA
226 
/*!
 * \brief AVX2+FMA implementation (unaligned loads): dot product of 16-bit
 * integer samples with complex float taps, 16 samples per iteration.
 */
static inline void volk_16i_32fc_dot_prod_32fc_u_avx2_fma(lv_32fc_t* result,
                                                          const short* input,
                                                          const lv_32fc_t* taps,
                                                          unsigned int num_points)
{

    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
    const short* aPtr = input;
    // Taps viewed as a flat array of interleaved (re, im) floats.
    const float* bPtr = (float*)taps;

    __m128i m0, m1;
    __m256i f0, f1;
    __m256 g0, g1, h0, h1, h2, h3;
    __m256 a0Val, a1Val, a2Val, a3Val;
    __m256 b0Val, b1Val, b2Val, b3Val;

    // Four independent accumulators; reduced after the loop.
    __m256 dotProdVal0 = _mm256_setzero_ps();
    __m256 dotProdVal1 = _mm256_setzero_ps();
    __m256 dotProdVal2 = _mm256_setzero_ps();
    __m256 dotProdVal3 = _mm256_setzero_ps();

    for (; number < sixteenthPoints; number++) {

        // Load 16 int16 samples (8 per 128-bit register).
        m0 = _mm_loadu_si128((__m128i const*)aPtr);
        m1 = _mm_loadu_si128((__m128i const*)(aPtr + 8));

        // Sign-extend int16 -> int32, then convert to float.
        f0 = _mm256_cvtepi16_epi32(m0);
        g0 = _mm256_cvtepi32_ps(f0);
        f1 = _mm256_cvtepi16_epi32(m1);
        g1 = _mm256_cvtepi32_ps(f1);

        // Self-unpack duplicates each sample within a 128-bit lane:
        // [s0,s0,s1,s1 | s4,s4,s5,s5] etc., to match (re,im) tap pairs.
        h0 = _mm256_unpacklo_ps(g0, g0);
        h1 = _mm256_unpackhi_ps(g0, g0);
        h2 = _mm256_unpacklo_ps(g1, g1);
        h3 = _mm256_unpackhi_ps(g1, g1);

        // Re-shuffle 128-bit lanes into sequential sample order.
        a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
        a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
        a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
        a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);

        b0Val = _mm256_loadu_ps(bPtr);
        b1Val = _mm256_loadu_ps(bPtr + 8);
        b2Val = _mm256_loadu_ps(bPtr + 16);
        b3Val = _mm256_loadu_ps(bPtr + 24);

        // Fused multiply-add into the accumulators.
        dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
        dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
        dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
        dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);

        aPtr += 16;  // 16 real samples consumed
        bPtr += 32;  // 16 complex taps = 32 floats consumed
    }

    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);

    __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];

    _mm256_store_ps(dotProductVector,
                    dotProdVal0); // Store the results back into the dot product vector

    // Lanes alternate (re, im); combine the four complex partial sums.
    returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]);
    returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]);
    returnValue += lv_cmake(dotProductVector[4], dotProductVector[5]);
    returnValue += lv_cmake(dotProductVector[6], dotProductVector[7]);

    // Scalar tail for the remaining (num_points % 16) samples.
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[0] * bPtr[1]);
        aPtr += 1;
        bPtr += 2;
    }

    *result = returnValue;
}
308 
#endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
310 
311 
312 #ifdef LV_HAVE_AVX2
313 
/*!
 * \brief AVX2 implementation (unaligned loads, no FMA): dot product of 16-bit
 * integer samples with complex float taps, 16 samples per iteration.
 */
static inline void volk_16i_32fc_dot_prod_32fc_u_avx2(lv_32fc_t* result,
                                                      const short* input,
                                                      const lv_32fc_t* taps,
                                                      unsigned int num_points)
{

    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
    const short* aPtr = input;
    // Taps viewed as a flat array of interleaved (re, im) floats.
    const float* bPtr = (float*)taps;

    __m128i m0, m1;
    __m256i f0, f1;
    __m256 g0, g1, h0, h1, h2, h3;
    __m256 a0Val, a1Val, a2Val, a3Val;
    __m256 b0Val, b1Val, b2Val, b3Val;
    __m256 c0Val, c1Val, c2Val, c3Val;

    // Four independent accumulators; reduced after the loop.
    __m256 dotProdVal0 = _mm256_setzero_ps();
    __m256 dotProdVal1 = _mm256_setzero_ps();
    __m256 dotProdVal2 = _mm256_setzero_ps();
    __m256 dotProdVal3 = _mm256_setzero_ps();

    for (; number < sixteenthPoints; number++) {

        // Load 16 int16 samples (8 per 128-bit register).
        m0 = _mm_loadu_si128((__m128i const*)aPtr);
        m1 = _mm_loadu_si128((__m128i const*)(aPtr + 8));

        // Sign-extend int16 -> int32, then convert to float.
        f0 = _mm256_cvtepi16_epi32(m0);
        g0 = _mm256_cvtepi32_ps(f0);
        f1 = _mm256_cvtepi16_epi32(m1);
        g1 = _mm256_cvtepi32_ps(f1);

        // Self-unpack duplicates each sample within a 128-bit lane:
        // [s0,s0,s1,s1 | s4,s4,s5,s5] etc., to match (re,im) tap pairs.
        h0 = _mm256_unpacklo_ps(g0, g0);
        h1 = _mm256_unpackhi_ps(g0, g0);
        h2 = _mm256_unpacklo_ps(g1, g1);
        h3 = _mm256_unpackhi_ps(g1, g1);

        // Re-shuffle 128-bit lanes into sequential sample order.
        a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
        a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
        a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
        a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);

        b0Val = _mm256_loadu_ps(bPtr);
        b1Val = _mm256_loadu_ps(bPtr + 8);
        b2Val = _mm256_loadu_ps(bPtr + 16);
        b3Val = _mm256_loadu_ps(bPtr + 24);

        c0Val = _mm256_mul_ps(a0Val, b0Val);
        c1Val = _mm256_mul_ps(a1Val, b1Val);
        c2Val = _mm256_mul_ps(a2Val, b2Val);
        c3Val = _mm256_mul_ps(a3Val, b3Val);

        dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
        dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
        dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
        dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);

        aPtr += 16;  // 16 real samples consumed
        bPtr += 32;  // 16 complex taps = 32 floats consumed
    }

    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);

    __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];

    _mm256_store_ps(dotProductVector,
                    dotProdVal0); // Store the results back into the dot product vector

    // Lanes alternate (re, im); combine the four complex partial sums.
    returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]);
    returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]);
    returnValue += lv_cmake(dotProductVector[4], dotProductVector[5]);
    returnValue += lv_cmake(dotProductVector[6], dotProductVector[7]);

    // Scalar tail for the remaining (num_points % 16) samples.
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[0] * bPtr[1]);
        aPtr += 1;
        bPtr += 2;
    }

    *result = returnValue;
}
401 
402 #endif /*LV_HAVE_AVX2*/
403 
404 
405 #if LV_HAVE_SSE && LV_HAVE_MMX
406 
407 
/*!
 * \brief SSE/MMX implementation (aligned loads): dot product of 16-bit
 * integer samples with complex float taps, 8 samples per iteration.
 * Requires 16-byte-aligned taps.
 */
static inline void volk_16i_32fc_dot_prod_32fc_a_sse(lv_32fc_t* result,
                                                     const short* input,
                                                     const lv_32fc_t* taps,
                                                     unsigned int num_points)
{

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
    const short* aPtr = input;
    // Taps viewed as a flat array of interleaved (re, im) floats.
    const float* bPtr = (float*)taps;

    __m64 m0, m1;
    __m128 f0, f1, f2, f3;
    __m128 a0Val, a1Val, a2Val, a3Val;
    __m128 b0Val, b1Val, b2Val, b3Val;
    __m128 c0Val, c1Val, c2Val, c3Val;

    // Four independent accumulators; reduced after the loop.
    __m128 dotProdVal0 = _mm_setzero_ps();
    __m128 dotProdVal1 = _mm_setzero_ps();
    __m128 dotProdVal2 = _mm_setzero_ps();
    __m128 dotProdVal3 = _mm_setzero_ps();

    for (; number < eighthPoints; number++) {

        // Pack 8 int16 samples into two MMX registers and convert to float.
        m0 = _mm_set_pi16(*(aPtr + 3), *(aPtr + 2), *(aPtr + 1), *(aPtr + 0));
        m1 = _mm_set_pi16(*(aPtr + 7), *(aPtr + 6), *(aPtr + 5), *(aPtr + 4));
        f0 = _mm_cvtpi16_ps(m0);
        f1 = _mm_cvtpi16_ps(m0);
        f2 = _mm_cvtpi16_ps(m1);
        f3 = _mm_cvtpi16_ps(m1);

        // f0 == f1 (and f2 == f3), so unpack duplicates each sample:
        // [s0,s0,s1,s1] etc., matching the interleaved (re,im) tap layout.
        a0Val = _mm_unpacklo_ps(f0, f1);
        a1Val = _mm_unpackhi_ps(f0, f1);
        a2Val = _mm_unpacklo_ps(f2, f3);
        a3Val = _mm_unpackhi_ps(f2, f3);

        b0Val = _mm_load_ps(bPtr);
        b1Val = _mm_load_ps(bPtr + 4);
        b2Val = _mm_load_ps(bPtr + 8);
        b3Val = _mm_load_ps(bPtr + 12);

        c0Val = _mm_mul_ps(a0Val, b0Val);
        c1Val = _mm_mul_ps(a1Val, b1Val);
        c2Val = _mm_mul_ps(a2Val, b2Val);
        c3Val = _mm_mul_ps(a3Val, b3Val);

        dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
        dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
        dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
        dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);

        aPtr += 8;   // 8 real samples consumed
        bPtr += 16;  // 8 complex taps = 16 floats consumed
    }

    _mm_empty(); // clear the mmx technology state

    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);

    __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];

    _mm_store_ps(dotProductVector,
                 dotProdVal0); // Store the results back into the dot product vector

    // Lanes alternate (re, im); combine the two complex partial sums.
    returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]);
    returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]);

    // Scalar tail for the remaining (num_points % 8) samples.
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[0] * bPtr[1]);
        aPtr += 1;
        bPtr += 2;
    }

    *result = returnValue;
}
488 
489 #endif /*LV_HAVE_SSE && LV_HAVE_MMX*/
490 
491 #ifdef LV_HAVE_AVX2
492 
/*!
 * \brief AVX2 implementation (aligned loads, no FMA): dot product of 16-bit
 * integer samples with complex float taps, 16 samples per iteration.
 * Requires 16-byte-aligned input and 32-byte-aligned taps.
 */
static inline void volk_16i_32fc_dot_prod_32fc_a_avx2(lv_32fc_t* result,
                                                      const short* input,
                                                      const lv_32fc_t* taps,
                                                      unsigned int num_points)
{

    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
    const short* aPtr = input;
    // Taps viewed as a flat array of interleaved (re, im) floats.
    const float* bPtr = (float*)taps;

    __m128i m0, m1;
    __m256i f0, f1;
    __m256 g0, g1, h0, h1, h2, h3;
    __m256 a0Val, a1Val, a2Val, a3Val;
    __m256 b0Val, b1Val, b2Val, b3Val;
    __m256 c0Val, c1Val, c2Val, c3Val;

    // Four independent accumulators; reduced after the loop.
    __m256 dotProdVal0 = _mm256_setzero_ps();
    __m256 dotProdVal1 = _mm256_setzero_ps();
    __m256 dotProdVal2 = _mm256_setzero_ps();
    __m256 dotProdVal3 = _mm256_setzero_ps();

    for (; number < sixteenthPoints; number++) {

        // Load 16 int16 samples (8 per 128-bit register).
        m0 = _mm_load_si128((__m128i const*)aPtr);
        m1 = _mm_load_si128((__m128i const*)(aPtr + 8));

        // Sign-extend int16 -> int32, then convert to float.
        f0 = _mm256_cvtepi16_epi32(m0);
        g0 = _mm256_cvtepi32_ps(f0);
        f1 = _mm256_cvtepi16_epi32(m1);
        g1 = _mm256_cvtepi32_ps(f1);

        // Self-unpack duplicates each sample within a 128-bit lane:
        // [s0,s0,s1,s1 | s4,s4,s5,s5] etc., to match (re,im) tap pairs.
        h0 = _mm256_unpacklo_ps(g0, g0);
        h1 = _mm256_unpackhi_ps(g0, g0);
        h2 = _mm256_unpacklo_ps(g1, g1);
        h3 = _mm256_unpackhi_ps(g1, g1);

        // Re-shuffle 128-bit lanes into sequential sample order.
        a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
        a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
        a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
        a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);

        b0Val = _mm256_load_ps(bPtr);
        b1Val = _mm256_load_ps(bPtr + 8);
        b2Val = _mm256_load_ps(bPtr + 16);
        b3Val = _mm256_load_ps(bPtr + 24);

        c0Val = _mm256_mul_ps(a0Val, b0Val);
        c1Val = _mm256_mul_ps(a1Val, b1Val);
        c2Val = _mm256_mul_ps(a2Val, b2Val);
        c3Val = _mm256_mul_ps(a3Val, b3Val);

        dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
        dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
        dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
        dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);

        aPtr += 16;  // 16 real samples consumed
        bPtr += 32;  // 16 complex taps = 32 floats consumed
    }

    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);

    __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];

    _mm256_store_ps(dotProductVector,
                    dotProdVal0); // Store the results back into the dot product vector

    // Lanes alternate (re, im); combine the four complex partial sums.
    returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]);
    returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]);
    returnValue += lv_cmake(dotProductVector[4], dotProductVector[5]);
    returnValue += lv_cmake(dotProductVector[6], dotProductVector[7]);

    // Scalar tail for the remaining (num_points % 16) samples.
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[0] * bPtr[1]);
        aPtr += 1;
        bPtr += 2;
    }

    *result = returnValue;
}
580 
581 
582 #endif /*LV_HAVE_AVX2*/
583 
584 #if LV_HAVE_AVX2 && LV_HAVE_FMA
585 
/*!
 * \brief AVX2+FMA implementation (aligned loads): dot product of 16-bit
 * integer samples with complex float taps, 16 samples per iteration.
 * Requires 16-byte-aligned input and 32-byte-aligned taps.
 */
static inline void volk_16i_32fc_dot_prod_32fc_a_avx2_fma(lv_32fc_t* result,
                                                          const short* input,
                                                          const lv_32fc_t* taps,
                                                          unsigned int num_points)
{

    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
    const short* aPtr = input;
    // Taps viewed as a flat array of interleaved (re, im) floats.
    const float* bPtr = (float*)taps;

    __m128i m0, m1;
    __m256i f0, f1;
    __m256 g0, g1, h0, h1, h2, h3;
    __m256 a0Val, a1Val, a2Val, a3Val;
    __m256 b0Val, b1Val, b2Val, b3Val;

    // Four independent accumulators; reduced after the loop.
    __m256 dotProdVal0 = _mm256_setzero_ps();
    __m256 dotProdVal1 = _mm256_setzero_ps();
    __m256 dotProdVal2 = _mm256_setzero_ps();
    __m256 dotProdVal3 = _mm256_setzero_ps();

    for (; number < sixteenthPoints; number++) {

        // Load 16 int16 samples (8 per 128-bit register).
        m0 = _mm_load_si128((__m128i const*)aPtr);
        m1 = _mm_load_si128((__m128i const*)(aPtr + 8));

        // Sign-extend int16 -> int32, then convert to float.
        f0 = _mm256_cvtepi16_epi32(m0);
        g0 = _mm256_cvtepi32_ps(f0);
        f1 = _mm256_cvtepi16_epi32(m1);
        g1 = _mm256_cvtepi32_ps(f1);

        // Self-unpack duplicates each sample within a 128-bit lane:
        // [s0,s0,s1,s1 | s4,s4,s5,s5] etc., to match (re,im) tap pairs.
        h0 = _mm256_unpacklo_ps(g0, g0);
        h1 = _mm256_unpackhi_ps(g0, g0);
        h2 = _mm256_unpacklo_ps(g1, g1);
        h3 = _mm256_unpackhi_ps(g1, g1);

        // Re-shuffle 128-bit lanes into sequential sample order.
        a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
        a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
        a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
        a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);

        b0Val = _mm256_load_ps(bPtr);
        b1Val = _mm256_load_ps(bPtr + 8);
        b2Val = _mm256_load_ps(bPtr + 16);
        b3Val = _mm256_load_ps(bPtr + 24);

        // Fused multiply-add into the accumulators.
        dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
        dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
        dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
        dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);

        aPtr += 16;  // 16 real samples consumed
        bPtr += 32;  // 16 complex taps = 32 floats consumed
    }

    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);

    __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];

    _mm256_store_ps(dotProductVector,
                    dotProdVal0); // Store the results back into the dot product vector

    // Lanes alternate (re, im); combine the four complex partial sums.
    returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]);
    returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]);
    returnValue += lv_cmake(dotProductVector[4], dotProductVector[5]);
    returnValue += lv_cmake(dotProductVector[6], dotProductVector[7]);

    // Scalar tail for the remaining (num_points % 16) samples.
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[0] * bPtr[1]);
        aPtr += 1;
        bPtr += 2;
    }

    *result = returnValue;
}
667 
668 
669 #endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
670 
671 #ifdef LV_HAVE_RVV
672 #include <riscv_vector.h>
674 
/*!
 * \brief RISC-V Vector implementation: dot product of 16-bit integer samples
 * with complex float taps, using strip-mined vector-length-agnostic loops.
 */
static inline void volk_16i_32fc_dot_prod_32fc_rvv(lv_32fc_t* result,
                                                   const short* input,
                                                   const lv_32fc_t* taps,
                                                   unsigned int num_points)
{
    // Separate real/imag accumulators, zeroed at maximum vector length.
    vfloat32m4_t vsumr = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4());
    vfloat32m4_t vsumi = vsumr;
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, input += vl, taps += vl) {
        vl = __riscv_vsetvl_e32m4(n);
        // Load each complex tap as one 64-bit lane, then split via narrowing
        // shifts: low 32 bits -> real part, high 32 bits -> imaginary part
        // (assumes little-endian lv_32fc_t layout — standard for RISC-V).
        vuint64m8_t vc = __riscv_vle64_v_u64m8((const uint64_t*)taps, vl);
        vfloat32m4_t vr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 0, vl));
        vfloat32m4_t vi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 32, vl));
        // Widen int16 samples directly to float.
        vfloat32m4_t v =
            __riscv_vfwcvt_f(__riscv_vle16_v_i16m2((const int16_t*)input, vl), vl);
        // Tail-undisturbed FMA preserves partial sums in inactive lanes.
        vsumr = __riscv_vfmacc_tu(vsumr, vr, v, vl);
        vsumi = __riscv_vfmacc_tu(vsumi, vi, v, vl);
    }
    // Fold the m4 accumulators down to m1, then reduce each to a scalar.
    size_t vl = __riscv_vsetvlmax_e32m1();
    vfloat32m1_t vr = RISCV_SHRINK4(vfadd, f, 32, vsumr);
    vfloat32m1_t vi = RISCV_SHRINK4(vfadd, f, 32, vsumi);
    vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl);
    *result = lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vl)),
                       __riscv_vfmv_f(__riscv_vfredusum(vi, z, vl)));
}
700 #endif /*LV_HAVE_RVV*/
701 
702 #ifdef LV_HAVE_RVVSEG
703 #include <riscv_vector.h>
705 
/*!
 * \brief RISC-V Vector implementation using segmented loads: dot product of
 * 16-bit integer samples with complex float taps. Functionally identical to
 * the plain RVV kernel but de-interleaves taps with vlseg2 instead of
 * 64-bit loads plus narrowing shifts.
 */
static inline void volk_16i_32fc_dot_prod_32fc_rvvseg(lv_32fc_t* result,
                                                      const short* input,
                                                      const lv_32fc_t* taps,
                                                      unsigned int num_points)
{
    // Separate real/imag accumulators, zeroed at maximum vector length.
    vfloat32m4_t vsumr = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4());
    vfloat32m4_t vsumi = vsumr;
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, input += vl, taps += vl) {
        vl = __riscv_vsetvl_e32m4(n);
        // Segmented load de-interleaves taps: segment 0 = real, 1 = imaginary.
        vfloat32m4x2_t vc = __riscv_vlseg2e32_v_f32m4x2((const float*)taps, vl);
        vfloat32m4_t vr = __riscv_vget_f32m4(vc, 0);
        vfloat32m4_t vi = __riscv_vget_f32m4(vc, 1);
        // Widen int16 samples directly to float.
        vfloat32m4_t v =
            __riscv_vfwcvt_f(__riscv_vle16_v_i16m2((const int16_t*)input, vl), vl);
        // Tail-undisturbed FMA preserves partial sums in inactive lanes.
        vsumr = __riscv_vfmacc_tu(vsumr, vr, v, vl);
        vsumi = __riscv_vfmacc_tu(vsumi, vi, v, vl);
    }
    // Fold the m4 accumulators down to m1, then reduce each to a scalar.
    size_t vl = __riscv_vsetvlmax_e32m1();
    vfloat32m1_t vr = RISCV_SHRINK4(vfadd, f, 32, vsumr);
    vfloat32m1_t vi = RISCV_SHRINK4(vfadd, f, 32, vsumi);
    vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl);
    *result = lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vl)),
                       __riscv_vfmv_f(__riscv_vfredusum(vi, z, vl)));
}
731 #endif /*LV_HAVE_RVVSEG*/
732 
733 #endif /*INCLUDED_volk_16i_32fc_dot_prod_32fc_H*/
/*
 * Doxygen cross-reference residue (retained for reference):
 *   volk_16i_32fc_dot_prod_32fc_neon(lv_32fc_t*, const short*, const lv_32fc_t*, unsigned int)
 *       — defined at volk_16i_32fc_dot_prod_32fc.h:88
 *   volk_16i_32fc_dot_prod_32fc_generic(lv_32fc_t*, const short*, const lv_32fc_t*, unsigned int)
 *       — defined at volk_16i_32fc_dot_prod_32fc.h:54
 *   __VOLK_ATTR_ALIGNED(x)        — volk_common.h:62
 *   lv_cmake(r, i)                — volk_complex.h:77
 *   lv_32fc_t (float complex)     — volk_complex.h:74
 *   RISCV_SHRINK4(op, T, S, v)    — volk_rvv_intrinsics.h:24
 */