volk_32fc_x2_dot_prod_32fc.h
/* -*- c++ -*- */
/*
 * Copyright 2012, 2013, 2014 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

#ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H
#define INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H

#include <stdio.h>
#include <string.h>
#include <volk/volk_common.h>
#include <volk/volk_complex.h>


#ifdef LV_HAVE_RISCV64
extern void volk_32fc_x2_dot_prod_32fc_sifive_u74(lv_32fc_t* result,
                                                  const lv_32fc_t* input,
                                                  const lv_32fc_t* taps,
                                                  unsigned int num_points);
#endif

#ifdef LV_HAVE_GENERIC


static inline void volk_32fc_x2_dot_prod_32fc_generic(lv_32fc_t* result,
                                                      const lv_32fc_t* input,
                                                      const lv_32fc_t* taps,
                                                      unsigned int num_points)
{

    float* res = (float*)result;
    float* in = (float*)input;
    float* tp = (float*)taps;
    unsigned int n_2_ccomplex_blocks = num_points / 2;

    float sum0[2] = { 0, 0 };
    float sum1[2] = { 0, 0 };
    unsigned int i = 0;

    for (i = 0; i < n_2_ccomplex_blocks; ++i) {
        sum0[0] += in[0] * tp[0] - in[1] * tp[1];
        sum0[1] += in[0] * tp[1] + in[1] * tp[0];
        sum1[0] += in[2] * tp[2] - in[3] * tp[3];
        sum1[1] += in[2] * tp[3] + in[3] * tp[2];

        in += 4;
        tp += 4;
    }

    res[0] = sum0[0] + sum1[0];
    res[1] = sum0[1] + sum1[1];

    // Cleanup if we had an odd number of points
    if (num_points & 1) {
        *result += input[num_points - 1] * taps[num_points - 1];
    }
}

#endif /*LV_HAVE_GENERIC*/
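
/*
 * Editor's note (illustrative, not part of the upstream header): the kernel
 * computes the plain, non-conjugated inner product sum(input[k] * taps[k]).
 * A minimal usage sketch follows; a real caller would normally invoke the
 * volk_32fc_x2_dot_prod_32fc() dispatcher from <volk/volk.h> rather than a
 * specific implementation, and would use volk_malloc() for aligned buffers.
 */
#if 0 /* example only */
#include <stdio.h>
int main(void)
{
    lv_32fc_t input[3] = { lv_cmake(1.f, 2.f), lv_cmake(3.f, 4.f), lv_cmake(5.f, 6.f) };
    lv_32fc_t taps[3] = { lv_cmake(1.f, 0.f), lv_cmake(0.f, 1.f), lv_cmake(1.f, 1.f) };
    lv_32fc_t result;
    volk_32fc_x2_dot_prod_32fc_generic(&result, input, taps, 3);
    /* (1+2i)*1 + (3+4i)*i + (5+6i)*(1+i) = -4 + 16i */
    printf("%g%+gi\n", lv_creal(result), lv_cimag(result));
    return 0;
}
#endif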


#ifdef LV_HAVE_SSE3

#include <pmmintrin.h>

static inline void volk_32fc_x2_dot_prod_32fc_u_sse3(lv_32fc_t* result,
                                                     const lv_32fc_t* input,
                                                     const lv_32fc_t* taps,
                                                     unsigned int num_points)
{

    lv_32fc_t dotProduct;
    memset(&dotProduct, 0x0, 2 * sizeof(float));

    unsigned int number = 0;
    const unsigned int halfPoints = num_points / 2;
    unsigned int isodd = num_points & 1;

    __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;

    const lv_32fc_t* a = input;
    const lv_32fc_t* b = taps;

    dotProdVal = _mm_setzero_ps();

    for (; number < halfPoints; number++) {

        x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
        y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di

        yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
        yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di

        tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr

        x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br

        tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di

        z = _mm_addsub_ps(tmp1,
                          tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

        dotProdVal =
            _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together

        a += 2;
        b += 2;
    }

    __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2];

    _mm_storeu_ps((float*)dotProductVector,
                  dotProdVal); // Store the results back into the dot product vector

    dotProduct += (dotProductVector[0] + dotProductVector[1]);

    if (isodd) {
        dotProduct += input[num_points - 1] * taps[num_points - 1];
    }

    *result = dotProduct;
}

#endif /*LV_HAVE_SSE3*/
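
/*
 * Editor's note: the SSE3 kernel above is the standard moveldup/movehdup/
 * addsub complex multiply. For one complex pair a = ar+ai*i and c = cr+ci*i,
 * the even lane of _mm_addsub_ps subtracts and the odd lane adds, which is
 * exactly the scalar arithmetic below (illustrative sketch, not VOLK API):
 */
#if 0 /* example only */
static inline lv_32fc_t cmul_addsub_scalar(float ar, float ai, float cr, float ci)
{
    float tmp1_even = ar * cr; /* x * yl, even lane          */
    float tmp1_odd = ai * cr;  /* x * yl, odd lane           */
    float tmp2_even = ai * ci; /* swapped(x) * yh, even lane */
    float tmp2_odd = ar * ci;  /* swapped(x) * yh, odd lane  */
    return lv_cmake(tmp1_even - tmp2_even, /* addsub: even lane subtracts */
                    tmp1_odd + tmp2_odd);  /* addsub: odd lane adds       */
}
#endif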

#ifdef LV_HAVE_AVX

#include <immintrin.h>

static inline void volk_32fc_x2_dot_prod_32fc_u_avx(lv_32fc_t* result,
                                                    const lv_32fc_t* input,
                                                    const lv_32fc_t* taps,
                                                    unsigned int num_points)
{

    unsigned int isodd = num_points & 3;
    unsigned int i = 0;
    lv_32fc_t dotProduct;
    memset(&dotProduct, 0x0, 2 * sizeof(float));

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;

    const lv_32fc_t* a = input;
    const lv_32fc_t* b = taps;

    dotProdVal = _mm256_setzero_ps();

    for (; number < quarterPoints; number++) {
        x = _mm256_loadu_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi
        y = _mm256_loadu_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi

        yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr
        yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi

        tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ...

        x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr

        tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ...

        z = _mm256_addsub_ps(tmp1,
                             tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

        dotProdVal = _mm256_add_ps(dotProdVal,
                                   z); // Add the complex multiplication results together

        a += 4;
        b += 4;
    }

    __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];

    _mm256_storeu_ps((float*)dotProductVector,
                     dotProdVal); // Store the results back into the dot product vector

    dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
                   dotProductVector[3]);

    for (i = num_points - isodd; i < num_points; i++) {
        dotProduct += input[i] * taps[i];
    }

    *result = dotProduct;
}

#endif /*LV_HAVE_AVX*/

#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>

static inline void volk_32fc_x2_dot_prod_32fc_u_avx_fma(lv_32fc_t* result,
                                                        const lv_32fc_t* input,
                                                        const lv_32fc_t* taps,
                                                        unsigned int num_points)
{

    unsigned int isodd = num_points & 3;
    unsigned int i = 0;
    lv_32fc_t dotProduct;
    memset(&dotProduct, 0x0, 2 * sizeof(float));

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;

    const lv_32fc_t* a = input;
    const lv_32fc_t* b = taps;

    dotProdVal = _mm256_setzero_ps();

    for (; number < quarterPoints; number++) {

        x = _mm256_loadu_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi
        y = _mm256_loadu_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi

        yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr
        yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi

        tmp1 = x;

        x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr

        tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ...

        z = _mm256_fmaddsub_ps(
            tmp1, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

        dotProdVal = _mm256_add_ps(dotProdVal,
                                   z); // Add the complex multiplication results together

        a += 4;
        b += 4;
    }

    __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];

    _mm256_storeu_ps((float*)dotProductVector,
                     dotProdVal); // Store the results back into the dot product vector

    dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
                   dotProductVector[3]);

    for (i = num_points - isodd; i < num_points; i++) {
        dotProduct += input[i] * taps[i];
    }

    *result = dotProduct;
}

#endif /*LV_HAVE_AVX && LV_HAVE_FMA*/
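
/*
 * Editor's note: _mm256_fmaddsub_ps(tmp1, yl, tmp2) fuses the multiply and
 * the addsub of the plain AVX kernel into one instruction: even lanes get
 * tmp1*yl - tmp2, odd lanes get tmp1*yl + tmp2. Lane-wise sketch of that
 * equivalence (illustrative only):
 */
#if 0 /* example only */
static inline void
fmaddsub_lanes(const float a[8], const float b[8], const float c[8], float out[8])
{
    for (int k = 0; k < 8; k++)
        out[k] = (k & 1) ? a[k] * b[k] + c[k]  /* odd lane: add       */
                         : a[k] * b[k] - c[k]; /* even lane: subtract */
}
#endif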

#endif /*INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H*/

#ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H
#define INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H

#include <stdio.h>
#include <string.h>
#include <volk/volk_common.h>
#include <volk/volk_complex.h>


#ifdef LV_HAVE_SSE3

#include <pmmintrin.h>

static inline void volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result,
                                                     const lv_32fc_t* input,
                                                     const lv_32fc_t* taps,
                                                     unsigned int num_points)
{

    const unsigned int num_bytes = num_points * 8;
    unsigned int isodd = num_points & 1;

    lv_32fc_t dotProduct;
    memset(&dotProduct, 0x0, 2 * sizeof(float));

    unsigned int number = 0;
    const unsigned int halfPoints = num_bytes >> 4;

    __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;

    const lv_32fc_t* a = input;
    const lv_32fc_t* b = taps;

    dotProdVal = _mm_setzero_ps();

    for (; number < halfPoints; number++) {

        x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
        y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di

        yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
        yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di

        tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr

        x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br

        tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di

        z = _mm_addsub_ps(tmp1,
                          tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

        dotProdVal =
            _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together

        a += 2;
        b += 2;
    }

    __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2];

    _mm_store_ps((float*)dotProductVector,
                 dotProdVal); // Store the results back into the dot product vector

    dotProduct += (dotProductVector[0] + dotProductVector[1]);

    if (isodd) {
        dotProduct += input[num_points - 1] * taps[num_points - 1];
    }

    *result = dotProduct;
}

#endif /*LV_HAVE_SSE3*/


#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void volk_32fc_x2_dot_prod_32fc_neon(lv_32fc_t* result,
                                                   const lv_32fc_t* input,
                                                   const lv_32fc_t* taps,
                                                   unsigned int num_points)
{

    unsigned int quarter_points = num_points / 4;
    unsigned int number;

    lv_32fc_t* a_ptr = (lv_32fc_t*)taps;
    lv_32fc_t* b_ptr = (lv_32fc_t*)input;
    // for 2-lane vectors, 1st lane holds the real part,
    // 2nd lane holds the imaginary part
    float32x4x2_t a_val, b_val, c_val, accumulator;
    float32x4x2_t tmp_real, tmp_imag;
    accumulator.val[0] = vdupq_n_f32(0);
    accumulator.val[1] = vdupq_n_f32(0);

    for (number = 0; number < quarter_points; ++number) {
        a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
        b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
        __VOLK_PREFETCH(a_ptr + 8);
        __VOLK_PREFETCH(b_ptr + 8);

        // multiply the real*real and imag*imag to get real result
        // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
        tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
        // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
        tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]);

        // Multiply cross terms to get the imaginary result
        // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
        tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]);
        // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
        tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);

        c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
        c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);

        accumulator.val[0] = vaddq_f32(accumulator.val[0], c_val.val[0]);
        accumulator.val[1] = vaddq_f32(accumulator.val[1], c_val.val[1]);

        a_ptr += 4;
        b_ptr += 4;
    }
    lv_32fc_t accum_result[4];
    vst2q_f32((float*)accum_result, accumulator);
    *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];

    // tail case
    for (number = quarter_points * 4; number < num_points; ++number) {
        *result += (*a_ptr++) * (*b_ptr++);
    }
}
#endif /*LV_HAVE_NEON*/

#ifdef LV_HAVE_NEON
#include <arm_neon.h>
static inline void volk_32fc_x2_dot_prod_32fc_neon_opttests(lv_32fc_t* result,
                                                            const lv_32fc_t* input,
                                                            const lv_32fc_t* taps,
                                                            unsigned int num_points)
{

    unsigned int quarter_points = num_points / 4;
    unsigned int number;

    lv_32fc_t* a_ptr = (lv_32fc_t*)taps;
    lv_32fc_t* b_ptr = (lv_32fc_t*)input;
    // for 2-lane vectors, 1st lane holds the real part,
    // 2nd lane holds the imaginary part
    float32x4x2_t a_val, b_val, accumulator;
    float32x4x2_t tmp_imag;
    accumulator.val[0] = vdupq_n_f32(0);
    accumulator.val[1] = vdupq_n_f32(0);

    for (number = 0; number < quarter_points; ++number) {
        a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
        b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
        __VOLK_PREFETCH(a_ptr + 8);
        __VOLK_PREFETCH(b_ptr + 8);

        // do the first multiply
        tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
        tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);

        // use multiply accumulate/subtract to get result
        tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], b_val.val[1]);
        tmp_imag.val[0] = vmlsq_f32(tmp_imag.val[0], a_val.val[1], b_val.val[1]);

        accumulator.val[0] = vaddq_f32(accumulator.val[0], tmp_imag.val[0]);
        accumulator.val[1] = vaddq_f32(accumulator.val[1], tmp_imag.val[1]);

        // increment pointers
        a_ptr += 4;
        b_ptr += 4;
    }
    lv_32fc_t accum_result[4];
    vst2q_f32((float*)accum_result, accumulator);
    *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];

    // tail case
    for (number = quarter_points * 4; number < num_points; ++number) {
        *result += (*a_ptr++) * (*b_ptr++);
    }
}
#endif /*LV_HAVE_NEON*/
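
/*
 * Editor's note: vmlaq_f32(acc, x, y) computes acc + x*y per lane and
 * vmlsq_f32(acc, x, y) computes acc - x*y, so the kernel above forms the
 * complex product without the separate vsub/vadd of the first NEON kernel.
 * Per-lane scalar sketch (illustrative only):
 */
#if 0 /* example only */
static inline void
cmul_fused(float ar, float ai, float br, float bi, float* cr, float* ci)
{
    *ci = ai * br;  /* vmulq                 */
    *cr = ar * br;  /* vmulq                 */
    *ci += ar * bi; /* vmlaq: ai*br + ar*bi  */
    *cr -= ai * bi; /* vmlsq: ar*br - ai*bi  */
}
#endif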

#ifdef LV_HAVE_NEON
static inline void volk_32fc_x2_dot_prod_32fc_neon_optfma(lv_32fc_t* result,
                                                          const lv_32fc_t* input,
                                                          const lv_32fc_t* taps,
                                                          unsigned int num_points)
{

    unsigned int quarter_points = num_points / 4;
    unsigned int number;

    lv_32fc_t* a_ptr = (lv_32fc_t*)taps;
    lv_32fc_t* b_ptr = (lv_32fc_t*)input;
    // for 2-lane vectors, 1st lane holds the real part,
    // 2nd lane holds the imaginary part
    float32x4x2_t a_val, b_val, accumulator1, accumulator2;
    accumulator1.val[0] = vdupq_n_f32(0);
    accumulator1.val[1] = vdupq_n_f32(0);
    accumulator2.val[0] = vdupq_n_f32(0);
    accumulator2.val[1] = vdupq_n_f32(0);

    for (number = 0; number < quarter_points; ++number) {
        a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
        b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
        __VOLK_PREFETCH(a_ptr + 8);
        __VOLK_PREFETCH(b_ptr + 8);

        // use 2 accumulators to remove inter-instruction data dependencies
        accumulator1.val[0] = vmlaq_f32(accumulator1.val[0], a_val.val[0], b_val.val[0]);
        accumulator1.val[1] = vmlaq_f32(accumulator1.val[1], a_val.val[0], b_val.val[1]);
        accumulator2.val[0] = vmlsq_f32(accumulator2.val[0], a_val.val[1], b_val.val[1]);
        accumulator2.val[1] = vmlaq_f32(accumulator2.val[1], a_val.val[1], b_val.val[0]);
        // increment pointers
        a_ptr += 4;
        b_ptr += 4;
    }
    accumulator1.val[0] = vaddq_f32(accumulator1.val[0], accumulator2.val[0]);
    accumulator1.val[1] = vaddq_f32(accumulator1.val[1], accumulator2.val[1]);
    lv_32fc_t accum_result[4];
    vst2q_f32((float*)accum_result, accumulator1);
    *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];

    // tail case
    for (number = quarter_points * 4; number < num_points; ++number) {
        *result += (*a_ptr++) * (*b_ptr++);
    }
}
#endif /*LV_HAVE_NEON*/
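
/*
 * Editor's note: the two accumulators above exist so that consecutive
 * vmla/vmls results do not serialize on a single register. The same idea
 * in plain C (illustrative sketch): two independent chains let the
 * additions overlap in the pipeline; the partial sums are combined once
 * at the end, like the vaddq_f32 calls after the loop.
 */
#if 0 /* example only */
static inline float sum_two_chains(const float* x, unsigned int n)
{
    float s0 = 0.f, s1 = 0.f; /* independent dependency chains */
    unsigned int k;
    for (k = 0; k + 1 < n; k += 2) {
        s0 += x[k];
        s1 += x[k + 1];
    }
    if (n & 1)
        s0 += x[n - 1];
    return s0 + s1; /* combine the chains once */
}
#endif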

#ifdef LV_HAVE_NEON
static inline void volk_32fc_x2_dot_prod_32fc_neon_optfmaunroll(lv_32fc_t* result,
                                                                const lv_32fc_t* input,
                                                                const lv_32fc_t* taps,
                                                                unsigned int num_points)
{
    // NOTE: GCC does a poor job with this kernel, but the equivalent ASM code is very
    // fast

    unsigned int quarter_points = num_points / 8; // eight complex samples per iteration
    unsigned int number;

    lv_32fc_t* a_ptr = (lv_32fc_t*)taps;
    lv_32fc_t* b_ptr = (lv_32fc_t*)input;
    // vld4q de-interleaves by a stride of four floats, so val[0]/val[1] hold the
    // real/imag parts of the even-indexed samples and val[2]/val[3] those of the
    // odd-indexed samples
    float32x4x4_t a_val, b_val, accumulator1, accumulator2;
    float32x4x2_t reduced_accumulator;
    accumulator1.val[0] = vdupq_n_f32(0);
    accumulator1.val[1] = vdupq_n_f32(0);
    accumulator1.val[2] = vdupq_n_f32(0);
    accumulator1.val[3] = vdupq_n_f32(0);
    accumulator2.val[0] = vdupq_n_f32(0);
    accumulator2.val[1] = vdupq_n_f32(0);
    accumulator2.val[2] = vdupq_n_f32(0);
    accumulator2.val[3] = vdupq_n_f32(0);

    // 8 input regs, 8 accumulators -> 16/16 neon regs are used
    for (number = 0; number < quarter_points; ++number) {
        a_val = vld4q_f32((float*)a_ptr); // a0r|a2r|a4r|a6r || a0i|a2i|a4i|a6i ||
                                          // a1r|a3r|a5r|a7r || a1i|a3i|a5i|a7i
        b_val = vld4q_f32((float*)b_ptr); // b0r|b2r|b4r|b6r || b0i|b2i|b4i|b6i ||
                                          // b1r|b3r|b5r|b7r || b1i|b3i|b5i|b7i
        __VOLK_PREFETCH(a_ptr + 8);
        __VOLK_PREFETCH(b_ptr + 8);

        // use 2 accumulators to remove inter-instruction data dependencies
        accumulator1.val[0] = vmlaq_f32(accumulator1.val[0], a_val.val[0], b_val.val[0]);
        accumulator1.val[1] = vmlaq_f32(accumulator1.val[1], a_val.val[0], b_val.val[1]);

        accumulator1.val[2] = vmlaq_f32(accumulator1.val[2], a_val.val[2], b_val.val[2]);
        accumulator1.val[3] = vmlaq_f32(accumulator1.val[3], a_val.val[2], b_val.val[3]);

        accumulator2.val[0] = vmlsq_f32(accumulator2.val[0], a_val.val[1], b_val.val[1]);
        accumulator2.val[1] = vmlaq_f32(accumulator2.val[1], a_val.val[1], b_val.val[0]);

        accumulator2.val[2] = vmlsq_f32(accumulator2.val[2], a_val.val[3], b_val.val[3]);
        accumulator2.val[3] = vmlaq_f32(accumulator2.val[3], a_val.val[3], b_val.val[2]);
        // increment pointers
        a_ptr += 8;
        b_ptr += 8;
    }
    // reduce 8 accumulator lanes down to 2 (1 real and 1 imag)
    accumulator1.val[0] = vaddq_f32(accumulator1.val[0], accumulator1.val[2]);
    accumulator1.val[1] = vaddq_f32(accumulator1.val[1], accumulator1.val[3]);
    accumulator2.val[0] = vaddq_f32(accumulator2.val[0], accumulator2.val[2]);
    accumulator2.val[1] = vaddq_f32(accumulator2.val[1], accumulator2.val[3]);
    reduced_accumulator.val[0] = vaddq_f32(accumulator1.val[0], accumulator2.val[0]);
    reduced_accumulator.val[1] = vaddq_f32(accumulator1.val[1], accumulator2.val[1]);
    // now reduce accumulators to scalars
    lv_32fc_t accum_result[4];
    vst2q_f32((float*)accum_result, reduced_accumulator);
    *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];

    // tail case
    for (number = quarter_points * 8; number < num_points; ++number) {
        *result += (*a_ptr++) * (*b_ptr++);
    }
}
#endif /*LV_HAVE_NEON*/
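
/*
 * Editor's note: vld4q_f32 reads 16 consecutive floats (8 complex samples)
 * and de-interleaves with a stride of 4, i.e. val[v] lane l holds element
 * 4*l + v of memory. Index-map sketch of that layout (illustrative only):
 */
#if 0 /* example only */
static inline void vld4_layout(const float p[16], float out[4][4])
{
    for (int lane = 0; lane < 4; lane++)
        for (int v = 0; v < 4; v++)
            out[v][lane] = p[4 * lane + v]; /* val[0]: p[0],p[4],p[8],p[12] = even reals */
}
#endif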


#ifdef LV_HAVE_AVX

#include <immintrin.h>

static inline void volk_32fc_x2_dot_prod_32fc_a_avx(lv_32fc_t* result,
                                                    const lv_32fc_t* input,
                                                    const lv_32fc_t* taps,
                                                    unsigned int num_points)
{

    unsigned int isodd = num_points & 3;
    unsigned int i = 0;
    lv_32fc_t dotProduct;
    memset(&dotProduct, 0x0, 2 * sizeof(float));

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;

    const lv_32fc_t* a = input;
    const lv_32fc_t* b = taps;

    dotProdVal = _mm256_setzero_ps();

    for (; number < quarterPoints; number++) {

        x = _mm256_load_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi
        y = _mm256_load_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi

        yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr
        yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi

        tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ...

        x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr

        tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ...

        z = _mm256_addsub_ps(tmp1,
                             tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

        dotProdVal = _mm256_add_ps(dotProdVal,
                                   z); // Add the complex multiplication results together

        a += 4;
        b += 4;
    }

    __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];

    _mm256_store_ps((float*)dotProductVector,
                    dotProdVal); // Store the results back into the dot product vector

    dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
                   dotProductVector[3]);

    for (i = num_points - isodd; i < num_points; i++) {
        dotProduct += input[i] * taps[i];
    }

    *result = dotProduct;
}

#endif /*LV_HAVE_AVX*/

#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>

static inline void volk_32fc_x2_dot_prod_32fc_a_avx_fma(lv_32fc_t* result,
                                                        const lv_32fc_t* input,
                                                        const lv_32fc_t* taps,
                                                        unsigned int num_points)
{

    unsigned int isodd = num_points & 3;
    unsigned int i = 0;
    lv_32fc_t dotProduct;
    memset(&dotProduct, 0x0, 2 * sizeof(float));

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;

    const lv_32fc_t* a = input;
    const lv_32fc_t* b = taps;

    dotProdVal = _mm256_setzero_ps();

    for (; number < quarterPoints; number++) {

        x = _mm256_load_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi
        y = _mm256_load_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi

        yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr
        yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi

        tmp1 = x;

        x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr

        tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ...

        z = _mm256_fmaddsub_ps(
            tmp1, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

        dotProdVal = _mm256_add_ps(dotProdVal,
                                   z); // Add the complex multiplication results together

        a += 4;
        b += 4;
    }

    __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];

    _mm256_store_ps((float*)dotProductVector,
                    dotProdVal); // Store the results back into the dot product vector

    dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
                   dotProductVector[3]);

    for (i = num_points - isodd; i < num_points; i++) {
        dotProduct += input[i] * taps[i];
    }

    *result = dotProduct;
}

#endif /*LV_HAVE_AVX && LV_HAVE_FMA*/

#ifdef LV_HAVE_RVV
#include <riscv_vector.h>
#include <volk/volk_rvv_intrinsics.h>

static inline void volk_32fc_x2_dot_prod_32fc_rvv(lv_32fc_t* result,
                                                  const lv_32fc_t* input,
                                                  const lv_32fc_t* taps,
                                                  unsigned int num_points)
{
    vfloat32m2_t vsumr = __riscv_vfmv_v_f_f32m2(0, __riscv_vsetvlmax_e32m2());
    vfloat32m2_t vsumi = vsumr;
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, input += vl, taps += vl) {
        vl = __riscv_vsetvl_e32m2(n);
        vuint64m4_t va = __riscv_vle64_v_u64m4((const uint64_t*)input, vl);
        vuint64m4_t vb = __riscv_vle64_v_u64m4((const uint64_t*)taps, vl);
        vfloat32m2_t var = __riscv_vreinterpret_f32m2(__riscv_vnsrl(va, 0, vl));
        vfloat32m2_t vbr = __riscv_vreinterpret_f32m2(__riscv_vnsrl(vb, 0, vl));
        vfloat32m2_t vai = __riscv_vreinterpret_f32m2(__riscv_vnsrl(va, 32, vl));
        vfloat32m2_t vbi = __riscv_vreinterpret_f32m2(__riscv_vnsrl(vb, 32, vl));
        vfloat32m2_t vr = __riscv_vfnmsac(__riscv_vfmul(var, vbr, vl), vai, vbi, vl);
        vfloat32m2_t vi = __riscv_vfmacc(__riscv_vfmul(var, vbi, vl), vai, vbr, vl);
        vsumr = __riscv_vfadd_tu(vsumr, vsumr, vr, vl);
        vsumi = __riscv_vfadd_tu(vsumi, vsumi, vi, vl);
    }
    size_t vl = __riscv_vsetvlmax_e32m1();
    vfloat32m1_t vr = RISCV_SHRINK2(vfadd, f, 32, vsumr);
    vfloat32m1_t vi = RISCV_SHRINK2(vfadd, f, 32, vsumi);
    vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl);
    *result = lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vl)),
                       __riscv_vfmv_f(__riscv_vfredusum(vi, z, vl)));
}
#endif /*LV_HAVE_RVV*/
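
/*
 * Editor's note: the RVV kernel loads each complex float as one 64-bit
 * element and splits it with narrowing shifts: vnsrl by 0 keeps the low
 * 32 bits (the real part) and vnsrl by 32 keeps the high 32 bits (the
 * imaginary part), assuming the usual little-endian lv_32fc_t layout.
 * Scalar sketch of that bit manipulation (illustrative only):
 */
#if 0 /* example only */
#include <stdint.h>
static inline void split_complex_bits(uint64_t packed, float* re, float* im)
{
    union { uint32_t u; float f; } lo = { (uint32_t)(packed & 0xffffffffu) };
    union { uint32_t u; float f; } hi = { (uint32_t)(packed >> 32) };
    *re = lo.f; /* low half  -> real      */
    *im = hi.f; /* high half -> imaginary */
}
#endif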

#ifdef LV_HAVE_RVVSEG
#include <riscv_vector.h>
#include <volk/volk_rvv_intrinsics.h>

static inline void volk_32fc_x2_dot_prod_32fc_rvvseg(lv_32fc_t* result,
                                                     const lv_32fc_t* input,
                                                     const lv_32fc_t* taps,
                                                     unsigned int num_points)
{
    vfloat32m4_t vsumr = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4());
    vfloat32m4_t vsumi = vsumr;
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, input += vl, taps += vl) {
        vl = __riscv_vsetvl_e32m4(n);
        vfloat32m4x2_t va = __riscv_vlseg2e32_v_f32m4x2((const float*)input, vl);
        vfloat32m4x2_t vb = __riscv_vlseg2e32_v_f32m4x2((const float*)taps, vl);
        vfloat32m4_t var = __riscv_vget_f32m4(va, 0), vai = __riscv_vget_f32m4(va, 1);
        vfloat32m4_t vbr = __riscv_vget_f32m4(vb, 0), vbi = __riscv_vget_f32m4(vb, 1);
        vfloat32m4_t vr = __riscv_vfnmsac(__riscv_vfmul(var, vbr, vl), vai, vbi, vl);
        vfloat32m4_t vi = __riscv_vfmacc(__riscv_vfmul(var, vbi, vl), vai, vbr, vl);
        vsumr = __riscv_vfadd_tu(vsumr, vsumr, vr, vl);
        vsumi = __riscv_vfadd_tu(vsumi, vsumi, vi, vl);
    }
    size_t vl = __riscv_vsetvlmax_e32m1();
    vfloat32m1_t vr = RISCV_SHRINK4(vfadd, f, 32, vsumr);
    vfloat32m1_t vi = RISCV_SHRINK4(vfadd, f, 32, vsumi);
    vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl);
    *result = lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vl)),
                       __riscv_vfmv_f(__riscv_vfredusum(vi, z, vl)));
}
#endif /*LV_HAVE_RVVSEG*/

#endif /*INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H*/