/*
 * Include guard for this header, followed by the scalar kernel that is
 * compiled only when LV_HAVE_GENERIC is defined.
 *
 * The kernel multiplies input[i] by taps[i] for num_points elements, sums
 * the products and writes the sum to *result as a 16-bit integer.
 *
 * NOTE(review): this listing has gaps. The start of the signature, the
 * opening brace and the declaration of dotProduct are not visible here;
 * dotProduct is presumably a float initialised to 0, as in the SIMD
 * variants -- confirm against the full file.
 */
58 #ifndef INCLUDED_volk_32f_x2_dot_prod_16i_H 59 #define INCLUDED_volk_32f_x2_dot_prod_16i_H 65 #ifdef LV_HAVE_GENERIC 71 unsigned int num_points)
/* Read-only cursors over the two operand arrays. */
75 const float* aPtr = input;
76 const float* bPtr = taps;
77 unsigned int number = 0;
/* One multiply-accumulate per element; both cursors advance by one each pass. */
79 for (number = 0; number < num_points; number++) {
80 dotProduct += ((*aPtr++) * (*bPtr++));
/*
 * Narrow the accumulated sum to int16_t with a plain cast.
 * NOTE(review): no range check is visible before the cast -- confirm that
 * callers keep the sum within the int16_t range.
 */
83 *result = (int16_t)dotProduct;
/*
 * SSE kernel using aligned loads (_mm_load_ps), so input and taps must be
 * 16-byte aligned. It computes the dot product of input and taps over
 * num_points floats and writes it to *result as a 16-bit integer.
 *
 * NOTE(review): this listing has gaps. Only the last line of the signature
 * is visible, and braces are missing.
 */
94 unsigned int num_points)
97 unsigned int number = 0;
/* Number of full 16-float groups handled by the vector loop. */
98 const unsigned int sixteenthPoints = num_points / 16;
100 float dotProduct = 0;
101 const float* aPtr = input;
102 const float* bPtr = taps;
104 __m128 a0Val, a1Val, a2Val, a3Val;
105 __m128 b0Val, b1Val, b2Val, b3Val;
106 __m128 c0Val, c1Val, c2Val, c3Val;
/* Four independent 4-lane accumulators, all starting at zero. */
108 __m128 dotProdVal0 = _mm_setzero_ps();
109 __m128 dotProdVal1 = _mm_setzero_ps();
110 __m128 dotProdVal2 = _mm_setzero_ps();
111 __m128 dotProdVal3 = _mm_setzero_ps();
/* Vector loop: each pass consumes 16 floats from each operand. */
113 for (; number < sixteenthPoints; number++) {
115 a0Val = _mm_load_ps(aPtr);
116 a1Val = _mm_load_ps(aPtr + 4);
117 a2Val = _mm_load_ps(aPtr + 8);
118 a3Val = _mm_load_ps(aPtr + 12);
119 b0Val = _mm_load_ps(bPtr);
120 b1Val = _mm_load_ps(bPtr + 4);
121 b2Val = _mm_load_ps(bPtr + 8);
122 b3Val = _mm_load_ps(bPtr + 12);
/* Lane-wise products of the four register pairs. */
124 c0Val = _mm_mul_ps(a0Val, b0Val);
125 c1Val = _mm_mul_ps(a1Val, b1Val);
126 c2Val = _mm_mul_ps(a2Val, b2Val);
127 c3Val = _mm_mul_ps(a3Val, b3Val);
/* Add each product vector into its own accumulator. */
129 dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
130 dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
131 dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
132 dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
/*
 * NOTE(review): lines are missing from this listing at this point. The
 * advance of aPtr/bPtr by 16 and the end of the loop are presumably among
 * them -- confirm against the full file.
 */
/* Fold the four accumulators into dotProdVal0. */
138 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
139 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
140 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
/*
 * Spill the combined accumulator to dotProductVector so its lanes can be
 * summed in scalar code.
 * NOTE(review): the declaration of dotProductVector and the second argument
 * of this call are not visible; the aligned store needs a 16-byte aligned
 * buffer -- confirm.
 */
144 _mm_store_ps(dotProductVector,
/* Horizontal sum of the four lanes. */
147 dotProduct = dotProductVector[0];
148 dotProduct += dotProductVector[1];
149 dotProduct += dotProductVector[2];
150 dotProduct += dotProductVector[3];
/* Scalar tail: elements left over after the last full group of 16. */
152 number = sixteenthPoints * 16;
153 for (; number < num_points; number++) {
154 dotProduct += ((*aPtr++) * (*bPtr++));
/*
 * Narrow the float sum with a plain cast (fraction discarded).
 * NOTE(review): no range check is visible before the cast -- confirm that
 * callers keep the sum within the 16-bit range.
 */
157 *result = (short)dotProduct;
/*
 * AVX2 + FMA kernel using aligned loads (_mm256_load_ps), so input and taps
 * must be 32-byte aligned. It is compiled only when both LV_HAVE_AVX2 and
 * LV_HAVE_FMA are non-zero. It computes the dot product of input and taps
 * over num_points floats and writes it to *result as a 16-bit integer.
 *
 * NOTE(review): this listing has gaps. The middle of the signature and the
 * braces are not visible.
 */
163 #if LV_HAVE_AVX2 && LV_HAVE_FMA 165 static inline void volk_32f_x2_dot_prod_16i_a_avx2_fma(int16_t* result,
168 unsigned int num_points)
171 unsigned int number = 0;
/* Number of full 32-float groups handled by the vector loop. */
172 const unsigned int thirtysecondPoints = num_points / 32;
174 float dotProduct = 0;
175 const float* aPtr = input;
176 const float* bPtr = taps;
178 __m256 a0Val, a1Val, a2Val, a3Val;
179 __m256 b0Val, b1Val, b2Val, b3Val;
/* Four independent 8-lane accumulators, all starting at zero. */
181 __m256 dotProdVal0 = _mm256_setzero_ps();
182 __m256 dotProdVal1 = _mm256_setzero_ps();
183 __m256 dotProdVal2 = _mm256_setzero_ps();
184 __m256 dotProdVal3 = _mm256_setzero_ps();
/* Vector loop: each pass consumes 32 floats from each operand. */
186 for (; number < thirtysecondPoints; number++) {
188 a0Val = _mm256_load_ps(aPtr);
189 a1Val = _mm256_load_ps(aPtr + 8);
190 a2Val = _mm256_load_ps(aPtr + 16);
191 a3Val = _mm256_load_ps(aPtr + 24);
192 b0Val = _mm256_load_ps(bPtr);
193 b1Val = _mm256_load_ps(bPtr + 8);
194 b2Val = _mm256_load_ps(bPtr + 16);
195 b3Val = _mm256_load_ps(bPtr + 24);
/* Fused multiply-add: accumulator = a * b + accumulator, per lane. */
197 dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
198 dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
199 dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
200 dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
/*
 * NOTE(review): lines are missing from this listing at this point. The
 * advance of aPtr/bPtr by 32 and the end of the loop are presumably among
 * them -- confirm against the full file.
 */
/* Fold the four accumulators into dotProdVal0. */
206 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
207 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
208 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
/*
 * Spill the combined accumulator to dotProductVector so its lanes can be
 * summed in scalar code.
 * NOTE(review): the declaration of dotProductVector and the second argument
 * of this call are not visible; the aligned store needs a 32-byte aligned
 * buffer -- confirm.
 */
212 _mm256_store_ps(dotProductVector,
/* Horizontal sum of the eight lanes. */
215 dotProduct = dotProductVector[0];
216 dotProduct += dotProductVector[1];
217 dotProduct += dotProductVector[2];
218 dotProduct += dotProductVector[3];
219 dotProduct += dotProductVector[4];
220 dotProduct += dotProductVector[5];
221 dotProduct += dotProductVector[6];
222 dotProduct += dotProductVector[7];
/* Scalar tail: elements left over after the last full group of 32. */
224 number = thirtysecondPoints * 32;
225 for (; number < num_points; number++) {
226 dotProduct += ((*aPtr++) * (*bPtr++));
/*
 * Narrow the float sum with a plain cast (fraction discarded). The cast is
 * spelled short although result is declared int16_t*.
 * NOTE(review): no range check is visible before the cast -- confirm that
 * callers keep the sum within the 16-bit range.
 */
229 *result = (short)dotProduct;
/*
 * AVX kernel using aligned loads (_mm256_load_ps), so input and taps must
 * be 32-byte aligned. It uses a separate multiply and add rather than a
 * fused multiply-add. It computes the dot product of input and taps over
 * num_points floats and writes it to *result as a 16-bit integer.
 *
 * NOTE(review): this listing has gaps. Only the last line of the signature
 * is visible, and braces are missing.
 */
240 unsigned int num_points)
243 unsigned int number = 0;
/* Number of full 32-float groups handled by the vector loop. */
244 const unsigned int thirtysecondPoints = num_points / 32;
246 float dotProduct = 0;
247 const float* aPtr = input;
248 const float* bPtr = taps;
250 __m256 a0Val, a1Val, a2Val, a3Val;
251 __m256 b0Val, b1Val, b2Val, b3Val;
252 __m256 c0Val, c1Val, c2Val, c3Val;
/* Four independent 8-lane accumulators, all starting at zero. */
254 __m256 dotProdVal0 = _mm256_setzero_ps();
255 __m256 dotProdVal1 = _mm256_setzero_ps();
256 __m256 dotProdVal2 = _mm256_setzero_ps();
257 __m256 dotProdVal3 = _mm256_setzero_ps();
/* Vector loop: each pass consumes 32 floats from each operand. */
259 for (; number < thirtysecondPoints; number++) {
261 a0Val = _mm256_load_ps(aPtr);
262 a1Val = _mm256_load_ps(aPtr + 8);
263 a2Val = _mm256_load_ps(aPtr + 16);
264 a3Val = _mm256_load_ps(aPtr + 24);
265 b0Val = _mm256_load_ps(bPtr);
266 b1Val = _mm256_load_ps(bPtr + 8);
267 b2Val = _mm256_load_ps(bPtr + 16);
268 b3Val = _mm256_load_ps(bPtr + 24);
/* Lane-wise products of the four register pairs. */
270 c0Val = _mm256_mul_ps(a0Val, b0Val);
271 c1Val = _mm256_mul_ps(a1Val, b1Val);
272 c2Val = _mm256_mul_ps(a2Val, b2Val);
273 c3Val = _mm256_mul_ps(a3Val, b3Val);
/* Add each product vector into its own accumulator. */
275 dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
276 dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
277 dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
278 dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
/*
 * NOTE(review): lines are missing from this listing at this point. The
 * advance of aPtr/bPtr by 32 and the end of the loop are presumably among
 * them -- confirm against the full file.
 */
/* Fold the four accumulators into dotProdVal0. */
284 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
285 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
286 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
/*
 * Spill the combined accumulator to dotProductVector so its lanes can be
 * summed in scalar code.
 * NOTE(review): the declaration of dotProductVector and the second argument
 * of this call are not visible; the aligned store needs a 32-byte aligned
 * buffer -- confirm.
 */
290 _mm256_store_ps(dotProductVector,
/* Horizontal sum of the eight lanes. */
293 dotProduct = dotProductVector[0];
294 dotProduct += dotProductVector[1];
295 dotProduct += dotProductVector[2];
296 dotProduct += dotProductVector[3];
297 dotProduct += dotProductVector[4];
298 dotProduct += dotProductVector[5];
299 dotProduct += dotProductVector[6];
300 dotProduct += dotProductVector[7];
/* Scalar tail: elements left over after the last full group of 32. */
302 number = thirtysecondPoints * 32;
303 for (; number < num_points; number++) {
304 dotProduct += ((*aPtr++) * (*bPtr++));
/*
 * Narrow the float sum with a plain cast (fraction discarded).
 * NOTE(review): no range check is visible before the cast -- confirm that
 * callers keep the sum within the 16-bit range.
 */
307 *result = (short)dotProduct;
/*
 * AVX-512F kernel using aligned loads (_mm512_load_ps), so input and taps
 * must be 64-byte aligned. It is compiled only when LV_HAVE_AVX512F is
 * defined. It computes the dot product of input and taps over num_points
 * floats and writes it to *result as a 16-bit integer.
 *
 * NOTE(review): this listing has gaps. The middle of the signature and the
 * braces are not visible.
 */
312 #ifdef LV_HAVE_AVX512F 314 static inline void volk_32f_x2_dot_prod_16i_a_avx512f(int16_t* result,
317 unsigned int num_points)
320 unsigned int number = 0;
/* Number of full 64-float groups handled by the vector loop. */
321 const unsigned int sixtyfourthPoints = num_points / 64;
323 float dotProduct = 0;
324 const float* aPtr = input;
325 const float* bPtr = taps;
327 __m512 a0Val, a1Val, a2Val, a3Val;
328 __m512 b0Val, b1Val, b2Val, b3Val;
/* Four independent 16-lane accumulators, all starting at zero. */
330 __m512 dotProdVal0 = _mm512_setzero_ps();
331 __m512 dotProdVal1 = _mm512_setzero_ps();
332 __m512 dotProdVal2 = _mm512_setzero_ps();
333 __m512 dotProdVal3 = _mm512_setzero_ps();
/* Vector loop: each pass consumes 64 floats from each operand. */
335 for (; number < sixtyfourthPoints; number++) {
337 a0Val = _mm512_load_ps(aPtr);
338 a1Val = _mm512_load_ps(aPtr + 16);
339 a2Val = _mm512_load_ps(aPtr + 32);
340 a3Val = _mm512_load_ps(aPtr + 48);
341 b0Val = _mm512_load_ps(bPtr);
342 b1Val = _mm512_load_ps(bPtr + 16);
343 b2Val = _mm512_load_ps(bPtr + 32);
344 b3Val = _mm512_load_ps(bPtr + 48);
/* Fused multiply-add: accumulator = a * b + accumulator, per lane. */
346 dotProdVal0 = _mm512_fmadd_ps(a0Val, b0Val, dotProdVal0);
347 dotProdVal1 = _mm512_fmadd_ps(a1Val, b1Val, dotProdVal1);
348 dotProdVal2 = _mm512_fmadd_ps(a2Val, b2Val, dotProdVal2);
349 dotProdVal3 = _mm512_fmadd_ps(a3Val, b3Val, dotProdVal3);
/*
 * NOTE(review): lines are missing from this listing at this point. The
 * advance of aPtr/bPtr by 64 and the end of the loop are presumably among
 * them -- confirm against the full file.
 */
/* Fold the four accumulators into dotProdVal0. */
355 dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal1);
356 dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal2);
357 dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal3);
/*
 * Spill the combined accumulator to dotProductVector so its lanes can be
 * summed in scalar code.
 * NOTE(review): the declaration of dotProductVector and the second argument
 * of this call are not visible; the aligned store needs a 64-byte aligned
 * buffer -- confirm.
 */
361 _mm512_store_ps(dotProductVector,
/* Horizontal sum of the sixteen lanes. */
364 dotProduct = dotProductVector[0];
365 dotProduct += dotProductVector[1];
366 dotProduct += dotProductVector[2];
367 dotProduct += dotProductVector[3];
368 dotProduct += dotProductVector[4];
369 dotProduct += dotProductVector[5];
370 dotProduct += dotProductVector[6];
371 dotProduct += dotProductVector[7];
372 dotProduct += dotProductVector[8];
373 dotProduct += dotProductVector[9];
374 dotProduct += dotProductVector[10];
375 dotProduct += dotProductVector[11];
376 dotProduct += dotProductVector[12];
377 dotProduct += dotProductVector[13];
378 dotProduct += dotProductVector[14];
379 dotProduct += dotProductVector[15];
/* Scalar tail: elements left over after the last full group of 64. */
381 number = sixtyfourthPoints * 64;
382 for (; number < num_points; number++) {
383 dotProduct += ((*aPtr++) * (*bPtr++));
/*
 * Narrow the float sum with a plain cast (fraction discarded). The cast is
 * spelled short although result is declared int16_t*.
 * NOTE(review): no range check is visible before the cast -- confirm that
 * callers keep the sum within the 16-bit range.
 */
386 *result = (short)dotProduct;
/*
 * SSE kernel using unaligned loads (_mm_loadu_ps), so input and taps need
 * no particular alignment. It computes the dot product of input and taps
 * over num_points floats and writes it to *result as a 16-bit integer.
 *
 * NOTE(review): this listing has gaps. Only the last line of the signature
 * is visible, and braces are missing.
 */
397 unsigned int num_points)
400 unsigned int number = 0;
/* Number of full 16-float groups handled by the vector loop. */
401 const unsigned int sixteenthPoints = num_points / 16;
403 float dotProduct = 0;
404 const float* aPtr = input;
405 const float* bPtr = taps;
407 __m128 a0Val, a1Val, a2Val, a3Val;
408 __m128 b0Val, b1Val, b2Val, b3Val;
409 __m128 c0Val, c1Val, c2Val, c3Val;
/* Four independent 4-lane accumulators, all starting at zero. */
411 __m128 dotProdVal0 = _mm_setzero_ps();
412 __m128 dotProdVal1 = _mm_setzero_ps();
413 __m128 dotProdVal2 = _mm_setzero_ps();
414 __m128 dotProdVal3 = _mm_setzero_ps();
/* Vector loop: each pass consumes 16 floats from each operand. */
416 for (; number < sixteenthPoints; number++) {
418 a0Val = _mm_loadu_ps(aPtr);
419 a1Val = _mm_loadu_ps(aPtr + 4);
420 a2Val = _mm_loadu_ps(aPtr + 8);
421 a3Val = _mm_loadu_ps(aPtr + 12);
422 b0Val = _mm_loadu_ps(bPtr);
423 b1Val = _mm_loadu_ps(bPtr + 4);
424 b2Val = _mm_loadu_ps(bPtr + 8);
425 b3Val = _mm_loadu_ps(bPtr + 12);
/* Lane-wise products of the four register pairs. */
427 c0Val = _mm_mul_ps(a0Val, b0Val);
428 c1Val = _mm_mul_ps(a1Val, b1Val);
429 c2Val = _mm_mul_ps(a2Val, b2Val);
430 c3Val = _mm_mul_ps(a3Val, b3Val);
/* Add each product vector into its own accumulator. */
432 dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
433 dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
434 dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
435 dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
/*
 * NOTE(review): lines are missing from this listing at this point. The
 * advance of aPtr/bPtr by 16 and the end of the loop are presumably among
 * them -- confirm against the full file.
 */
/* Fold the four accumulators into dotProdVal0. */
441 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
442 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
443 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
/*
 * Spill the combined accumulator to dotProductVector so its lanes can be
 * summed in scalar code. This is the aligned store even though the loads
 * are unaligned; the destination here is dotProductVector, not caller
 * memory.
 * NOTE(review): the declaration of dotProductVector and the second argument
 * of this call are not visible; the aligned store needs a 16-byte aligned
 * buffer -- confirm.
 */
447 _mm_store_ps(dotProductVector,
/* Horizontal sum of the four lanes. */
450 dotProduct = dotProductVector[0];
451 dotProduct += dotProductVector[1];
452 dotProduct += dotProductVector[2];
453 dotProduct += dotProductVector[3];
/* Scalar tail: elements left over after the last full group of 16. */
455 number = sixteenthPoints * 16;
456 for (; number < num_points; number++) {
457 dotProduct += ((*aPtr++) * (*bPtr++));
/*
 * Narrow the float sum with a plain cast (fraction discarded).
 * NOTE(review): no range check is visible before the cast -- confirm that
 * callers keep the sum within the 16-bit range.
 */
460 *result = (short)dotProduct;
/*
 * AVX2 + FMA kernel using unaligned loads (_mm256_loadu_ps), so input and
 * taps need no particular alignment. It is compiled only when both
 * LV_HAVE_AVX2 and LV_HAVE_FMA are non-zero. It computes the dot product of
 * input and taps over num_points floats and writes it to *result as a
 * 16-bit integer.
 *
 * NOTE(review): this listing has gaps. The middle of the signature and the
 * braces are not visible.
 */
466 #if LV_HAVE_AVX2 && LV_HAVE_FMA 468 static inline void volk_32f_x2_dot_prod_16i_u_avx2_fma(int16_t* result,
471 unsigned int num_points)
474 unsigned int number = 0;
/* Number of full 32-float groups handled by the vector loop. */
475 const unsigned int thirtysecondPoints = num_points / 32;
477 float dotProduct = 0;
478 const float* aPtr = input;
479 const float* bPtr = taps;
481 __m256 a0Val, a1Val, a2Val, a3Val;
482 __m256 b0Val, b1Val, b2Val, b3Val;
/* Four independent 8-lane accumulators, all starting at zero. */
484 __m256 dotProdVal0 = _mm256_setzero_ps();
485 __m256 dotProdVal1 = _mm256_setzero_ps();
486 __m256 dotProdVal2 = _mm256_setzero_ps();
487 __m256 dotProdVal3 = _mm256_setzero_ps();
/* Vector loop: each pass consumes 32 floats from each operand. */
489 for (; number < thirtysecondPoints; number++) {
491 a0Val = _mm256_loadu_ps(aPtr);
492 a1Val = _mm256_loadu_ps(aPtr + 8);
493 a2Val = _mm256_loadu_ps(aPtr + 16);
494 a3Val = _mm256_loadu_ps(aPtr + 24);
495 b0Val = _mm256_loadu_ps(bPtr);
496 b1Val = _mm256_loadu_ps(bPtr + 8);
497 b2Val = _mm256_loadu_ps(bPtr + 16);
498 b3Val = _mm256_loadu_ps(bPtr + 24);
/* Fused multiply-add: accumulator = a * b + accumulator, per lane. */
500 dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
501 dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
502 dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
503 dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
/*
 * NOTE(review): lines are missing from this listing at this point. The
 * advance of aPtr/bPtr by 32 and the end of the loop are presumably among
 * them -- confirm against the full file.
 */
/* Fold the four accumulators into dotProdVal0. */
509 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
510 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
511 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
/*
 * Spill the combined accumulator to dotProductVector so its lanes can be
 * summed in scalar code. This is the aligned store even though the loads
 * are unaligned; the destination here is dotProductVector, not caller
 * memory.
 * NOTE(review): the declaration of dotProductVector and the second argument
 * of this call are not visible; the aligned store needs a 32-byte aligned
 * buffer -- confirm.
 */
515 _mm256_store_ps(dotProductVector,
/* Horizontal sum of the eight lanes. */
518 dotProduct = dotProductVector[0];
519 dotProduct += dotProductVector[1];
520 dotProduct += dotProductVector[2];
521 dotProduct += dotProductVector[3];
522 dotProduct += dotProductVector[4];
523 dotProduct += dotProductVector[5];
524 dotProduct += dotProductVector[6];
525 dotProduct += dotProductVector[7];
/* Scalar tail: elements left over after the last full group of 32. */
527 number = thirtysecondPoints * 32;
528 for (; number < num_points; number++) {
529 dotProduct += ((*aPtr++) * (*bPtr++));
/*
 * Narrow the float sum with a plain cast (fraction discarded). The cast is
 * spelled short although result is declared int16_t*.
 * NOTE(review): no range check is visible before the cast -- confirm that
 * callers keep the sum within the 16-bit range.
 */
532 *result = (short)dotProduct;
/*
 * AVX kernel using unaligned loads (_mm256_loadu_ps), so input and taps
 * need no particular alignment. It uses a separate multiply and add rather
 * than a fused multiply-add. It computes the dot product of input and taps
 * over num_points floats and writes it to *result as a 16-bit integer.
 *
 * NOTE(review): this listing has gaps. Only the last line of the signature
 * is visible, and braces are missing.
 */
543 unsigned int num_points)
546 unsigned int number = 0;
/* Number of full 32-float groups handled by the vector loop. */
547 const unsigned int thirtysecondPoints = num_points / 32;
549 float dotProduct = 0;
550 const float* aPtr = input;
551 const float* bPtr = taps;
553 __m256 a0Val, a1Val, a2Val, a3Val;
554 __m256 b0Val, b1Val, b2Val, b3Val;
555 __m256 c0Val, c1Val, c2Val, c3Val;
/* Four independent 8-lane accumulators, all starting at zero. */
557 __m256 dotProdVal0 = _mm256_setzero_ps();
558 __m256 dotProdVal1 = _mm256_setzero_ps();
559 __m256 dotProdVal2 = _mm256_setzero_ps();
560 __m256 dotProdVal3 = _mm256_setzero_ps();
/* Vector loop: each pass consumes 32 floats from each operand. */
562 for (; number < thirtysecondPoints; number++) {
564 a0Val = _mm256_loadu_ps(aPtr);
565 a1Val = _mm256_loadu_ps(aPtr + 8);
566 a2Val = _mm256_loadu_ps(aPtr + 16);
567 a3Val = _mm256_loadu_ps(aPtr + 24);
568 b0Val = _mm256_loadu_ps(bPtr);
569 b1Val = _mm256_loadu_ps(bPtr + 8);
570 b2Val = _mm256_loadu_ps(bPtr + 16);
571 b3Val = _mm256_loadu_ps(bPtr + 24);
/* Lane-wise products of the four register pairs. */
573 c0Val = _mm256_mul_ps(a0Val, b0Val);
574 c1Val = _mm256_mul_ps(a1Val, b1Val);
575 c2Val = _mm256_mul_ps(a2Val, b2Val);
576 c3Val = _mm256_mul_ps(a3Val, b3Val);
/* Add each product vector into its own accumulator. */
578 dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
579 dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
580 dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
581 dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
/*
 * NOTE(review): lines are missing from this listing at this point. The
 * advance of aPtr/bPtr by 32 and the end of the loop are presumably among
 * them -- confirm against the full file.
 */
/* Fold the four accumulators into dotProdVal0. */
587 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
588 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
589 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
/*
 * Spill the combined accumulator to dotProductVector so its lanes can be
 * summed in scalar code. This is the aligned store even though the loads
 * are unaligned; the destination here is dotProductVector, not caller
 * memory.
 * NOTE(review): the declaration of dotProductVector and the second argument
 * of this call are not visible; the aligned store needs a 32-byte aligned
 * buffer -- confirm.
 */
593 _mm256_store_ps(dotProductVector,
/* Horizontal sum of the eight lanes. */
596 dotProduct = dotProductVector[0];
597 dotProduct += dotProductVector[1];
598 dotProduct += dotProductVector[2];
599 dotProduct += dotProductVector[3];
600 dotProduct += dotProductVector[4];
601 dotProduct += dotProductVector[5];
602 dotProduct += dotProductVector[6];
603 dotProduct += dotProductVector[7];
/* Scalar tail: elements left over after the last full group of 32. */
605 number = thirtysecondPoints * 32;
606 for (; number < num_points; number++) {
607 dotProduct += ((*aPtr++) * (*bPtr++));
/*
 * Narrow the float sum with a plain cast (fraction discarded).
 * NOTE(review): no range check is visible before the cast -- confirm that
 * callers keep the sum within the 16-bit range.
 */
610 *result = (short)dotProduct;
/*
 * AVX-512F kernel using unaligned loads (_mm512_loadu_ps), so input and
 * taps need no particular alignment. It is compiled only when
 * LV_HAVE_AVX512F is defined. It computes the dot product of input and taps
 * over num_points floats and writes it to *result as a 16-bit integer.
 *
 * NOTE(review): this listing has gaps. The middle of the signature and the
 * braces are not visible.
 */
615 #ifdef LV_HAVE_AVX512F 617 static inline void volk_32f_x2_dot_prod_16i_u_avx512f(int16_t* result,
620 unsigned int num_points)
623 unsigned int number = 0;
/* Number of full 64-float groups handled by the vector loop. */
624 const unsigned int sixtyfourthPoints = num_points / 64;
626 float dotProduct = 0;
627 const float* aPtr = input;
628 const float* bPtr = taps;
630 __m512 a0Val, a1Val, a2Val, a3Val;
631 __m512 b0Val, b1Val, b2Val, b3Val;
/* Four independent 16-lane accumulators, all starting at zero. */
633 __m512 dotProdVal0 = _mm512_setzero_ps();
634 __m512 dotProdVal1 = _mm512_setzero_ps();
635 __m512 dotProdVal2 = _mm512_setzero_ps();
636 __m512 dotProdVal3 = _mm512_setzero_ps();
/* Vector loop: each pass consumes 64 floats from each operand. */
638 for (; number < sixtyfourthPoints; number++) {
640 a0Val = _mm512_loadu_ps(aPtr);
641 a1Val = _mm512_loadu_ps(aPtr + 16);
642 a2Val = _mm512_loadu_ps(aPtr + 32);
643 a3Val = _mm512_loadu_ps(aPtr + 48);
644 b0Val = _mm512_loadu_ps(bPtr);
645 b1Val = _mm512_loadu_ps(bPtr + 16);
646 b2Val = _mm512_loadu_ps(bPtr + 32);
647 b3Val = _mm512_loadu_ps(bPtr + 48);
/* Fused multiply-add: accumulator = a * b + accumulator, per lane. */
649 dotProdVal0 = _mm512_fmadd_ps(a0Val, b0Val, dotProdVal0);
650 dotProdVal1 = _mm512_fmadd_ps(a1Val, b1Val, dotProdVal1);
651 dotProdVal2 = _mm512_fmadd_ps(a2Val, b2Val, dotProdVal2);
652 dotProdVal3 = _mm512_fmadd_ps(a3Val, b3Val, dotProdVal3);
/*
 * NOTE(review): lines are missing from this listing at this point. The
 * advance of aPtr/bPtr by 64 and the end of the loop are presumably among
 * them -- confirm against the full file.
 */
/* Fold the four accumulators into dotProdVal0. */
658 dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal1);
659 dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal2);
660 dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal3);
/*
 * Spill the combined accumulator to dotProductVector so its lanes can be
 * summed in scalar code. The unaligned store is used here, so
 * dotProductVector needs no particular alignment.
 * NOTE(review): the declaration of dotProductVector and the second argument
 * of this call are not visible in this listing.
 */
664 _mm512_storeu_ps(dotProductVector,
/* Horizontal sum of the sixteen lanes. */
667 dotProduct = dotProductVector[0];
668 dotProduct += dotProductVector[1];
669 dotProduct += dotProductVector[2];
670 dotProduct += dotProductVector[3];
671 dotProduct += dotProductVector[4];
672 dotProduct += dotProductVector[5];
673 dotProduct += dotProductVector[6];
674 dotProduct += dotProductVector[7];
675 dotProduct += dotProductVector[8];
676 dotProduct += dotProductVector[9];
677 dotProduct += dotProductVector[10];
678 dotProduct += dotProductVector[11];
679 dotProduct += dotProductVector[12];
680 dotProduct += dotProductVector[13];
681 dotProduct += dotProductVector[14];
682 dotProduct += dotProductVector[15];
/* Scalar tail: elements left over after the last full group of 64. */
684 number = sixtyfourthPoints * 64;
685 for (; number < num_points; number++) {
686 dotProduct += ((*aPtr++) * (*bPtr++));
/*
 * Narrow the float sum with a plain cast (fraction discarded). The cast is
 * spelled short although result is declared int16_t*.
 * NOTE(review): no range check is visible before the cast -- confirm that
 * callers keep the sum within the 16-bit range.
 */
689 *result = (short)dotProduct;
static void volk_32f_x2_dot_prod_16i_generic(int16_t *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_16i.h:68
static void volk_32f_x2_dot_prod_16i_a_sse(int16_t *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_16i.h:91
static void volk_32f_x2_dot_prod_16i_u_sse(int16_t *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_16i.h:394
static void volk_32f_x2_dot_prod_16i_u_avx(int16_t *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_16i.h:540
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:56
static void volk_32f_x2_dot_prod_16i_a_avx(int16_t *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_16i.h:237