71 #ifndef INCLUDED_volk_32f_x2_dot_prod_32f_u_H 72 #define INCLUDED_volk_32f_x2_dot_prod_32f_u_H 78 #ifdef LV_HAVE_GENERIC 84 unsigned int num_points)
88 const float* aPtr = input;
89 const float* bPtr = taps;
90 unsigned int number = 0;
92 for (number = 0; number < num_points; number++) {
93 dotProduct += ((*aPtr++) * (*bPtr++));
#ifdef LV_HAVE_SSE

#include <xmmintrin.h>

/*!
 * \brief SSE dot product using unaligned loads.
 *
 * Consumes 16 floats per loop iteration through four independent 4-lane
 * accumulators, folds the accumulators into one vector, sums its four lanes,
 * then handles the remaining (num_points % 16) samples with a scalar loop.
 *
 * \param result      receives the scalar dot product
 * \param input       first vector, num_points floats
 * \param taps        second vector, num_points floats
 * \param num_points  number of float pairs to multiply-accumulate
 *
 * NOTE(review): the LV_HAVE_SSE guard, the dotProductVector declaration and
 * the aPtr/bPtr advances were absent from the listing this block was
 * recovered from; they are restored from the loop structure and the sibling
 * kernels - confirm against upstream.
 */
static inline void volk_32f_x2_dot_prod_32f_u_sse(float* result,
                                                  const float* input,
                                                  const float* taps,
                                                  unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    float dotProduct = 0;
    const float* aPtr = input;
    const float* bPtr = taps;

    __m128 a0Val, a1Val, a2Val, a3Val;
    __m128 b0Val, b1Val, b2Val, b3Val;
    __m128 c0Val, c1Val, c2Val, c3Val;

    __m128 dotProdVal0 = _mm_setzero_ps();
    __m128 dotProdVal1 = _mm_setzero_ps();
    __m128 dotProdVal2 = _mm_setzero_ps();
    __m128 dotProdVal3 = _mm_setzero_ps();

    for (; number < sixteenthPoints; number++) {

        a0Val = _mm_loadu_ps(aPtr);
        a1Val = _mm_loadu_ps(aPtr + 4);
        a2Val = _mm_loadu_ps(aPtr + 8);
        a3Val = _mm_loadu_ps(aPtr + 12);
        b0Val = _mm_loadu_ps(bPtr);
        b1Val = _mm_loadu_ps(bPtr + 4);
        b2Val = _mm_loadu_ps(bPtr + 8);
        b3Val = _mm_loadu_ps(bPtr + 12);

        c0Val = _mm_mul_ps(a0Val, b0Val);
        c1Val = _mm_mul_ps(a1Val, b1Val);
        c2Val = _mm_mul_ps(a2Val, b2Val);
        c3Val = _mm_mul_ps(a3Val, b3Val);

        dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
        dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
        dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
        dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);

        /* The scalar tail below continues from aPtr/bPtr, so they must end
         * up just past the last vectorised sample. */
        aPtr += 16;
        bPtr += 16;
    }

    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);

    /* _mm_store_ps is an aligned store, hence the 16-byte aligned scratch. */
    __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];

    _mm_store_ps(dotProductVector, dotProdVal0);

    dotProduct = dotProductVector[0];
    dotProduct += dotProductVector[1];
    dotProduct += dotProductVector[2];
    dotProduct += dotProductVector[3];

    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        dotProduct += ((*aPtr++) * (*bPtr++));
    }

    *result = dotProduct;
}

#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_SSE3

#include <pmmintrin.h>

/*!
 * \brief SSE3-gated dot product using unaligned loads.
 *
 * Same shape as the SSE kernel: 16 floats per iteration spread over four
 * independent 4-lane accumulators, a fold of the accumulators, a lane sum,
 * and a scalar loop for the (num_points % 16) leftover samples.
 *
 * \param result      receives the scalar dot product
 * \param input       first vector, num_points floats
 * \param taps        second vector, num_points floats
 * \param num_points  number of float pairs to multiply-accumulate
 *
 * NOTE(review): the LV_HAVE_SSE3 guard, the dotProductVector declaration and
 * the aPtr/bPtr advances were absent from the listing this block was
 * recovered from; they are restored from the loop structure and the sibling
 * kernels - confirm against upstream.
 */
static inline void volk_32f_x2_dot_prod_32f_u_sse3(float* result,
                                                   const float* input,
                                                   const float* taps,
                                                   unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    float dotProduct = 0;
    const float* aPtr = input;
    const float* bPtr = taps;

    __m128 a0Val, a1Val, a2Val, a3Val;
    __m128 b0Val, b1Val, b2Val, b3Val;
    __m128 c0Val, c1Val, c2Val, c3Val;

    __m128 dotProdVal0 = _mm_setzero_ps();
    __m128 dotProdVal1 = _mm_setzero_ps();
    __m128 dotProdVal2 = _mm_setzero_ps();
    __m128 dotProdVal3 = _mm_setzero_ps();

    for (; number < sixteenthPoints; number++) {

        a0Val = _mm_loadu_ps(aPtr);
        a1Val = _mm_loadu_ps(aPtr + 4);
        a2Val = _mm_loadu_ps(aPtr + 8);
        a3Val = _mm_loadu_ps(aPtr + 12);
        b0Val = _mm_loadu_ps(bPtr);
        b1Val = _mm_loadu_ps(bPtr + 4);
        b2Val = _mm_loadu_ps(bPtr + 8);
        b3Val = _mm_loadu_ps(bPtr + 12);

        c0Val = _mm_mul_ps(a0Val, b0Val);
        c1Val = _mm_mul_ps(a1Val, b1Val);
        c2Val = _mm_mul_ps(a2Val, b2Val);
        c3Val = _mm_mul_ps(a3Val, b3Val);

        dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
        dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
        dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
        dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);

        /* Keep the cursors in step with the scalar tail below. */
        aPtr += 16;
        bPtr += 16;
    }

    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);

    /* _mm_store_ps is an aligned store, hence the 16-byte aligned scratch. */
    __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
    _mm_store_ps(dotProductVector, dotProdVal0);

    dotProduct = dotProductVector[0];
    dotProduct += dotProductVector[1];
    dotProduct += dotProductVector[2];
    dotProduct += dotProductVector[3];

    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        dotProduct += ((*aPtr++) * (*bPtr++));
    }

    *result = dotProduct;
}

#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_SSE4_1

#include <smmintrin.h>

/*!
 * \brief SSE4.1 dot product using unaligned loads and _mm_dp_ps.
 *
 * Each iteration loads four 4-float groups from each vector and reduces each
 * group with _mm_dp_ps. The four immediates (0xF1, 0xF2, 0xF4, 0xF8) differ
 * only in their low nibble, so each partial sum lands in a different lane;
 * the four results are OR-ed into one vector and added to the running
 * accumulator. Leftover (num_points % 16) samples go through a scalar loop.
 *
 * \param result      receives the scalar dot product
 * \param input       first vector, num_points floats
 * \param taps        second vector, num_points floats
 * \param num_points  number of float pairs to multiply-accumulate
 *
 * NOTE(review): the pointer advances between consecutive loads and the
 * dotProductVector declaration were absent from the listing this block was
 * recovered from; they are restored from the loop structure (four loads from
 * the same cursor only make sense with an advance in between) - confirm
 * against upstream.
 */
static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float* result,
                                                     const float* input,
                                                     const float* taps,
                                                     unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    float dotProduct = 0;
    const float* aPtr = input;
    const float* bPtr = taps;

    __m128 aVal1, bVal1, cVal1;
    __m128 aVal2, bVal2, cVal2;
    __m128 aVal3, bVal3, cVal3;
    __m128 aVal4, bVal4, cVal4;

    __m128 dotProdVal = _mm_setzero_ps();

    for (; number < sixteenthPoints; number++) {

        aVal1 = _mm_loadu_ps(aPtr);
        aPtr += 4;
        aVal2 = _mm_loadu_ps(aPtr);
        aPtr += 4;
        aVal3 = _mm_loadu_ps(aPtr);
        aPtr += 4;
        aVal4 = _mm_loadu_ps(aPtr);
        aPtr += 4;

        bVal1 = _mm_loadu_ps(bPtr);
        bPtr += 4;
        bVal2 = _mm_loadu_ps(bPtr);
        bPtr += 4;
        bVal3 = _mm_loadu_ps(bPtr);
        bPtr += 4;
        bVal4 = _mm_loadu_ps(bPtr);
        bPtr += 4;

        /* High nibble 0xF: use all four products. Low nibble: output lane. */
        cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
        cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
        cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
        cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);

        /* Lanes are disjoint, so OR merges the four partial sums. */
        cVal1 = _mm_or_ps(cVal1, cVal2);
        cVal3 = _mm_or_ps(cVal3, cVal4);
        cVal1 = _mm_or_ps(cVal1, cVal3);

        dotProdVal = _mm_add_ps(dotProdVal, cVal1);
    }

    /* _mm_store_ps is an aligned store, hence the 16-byte aligned scratch. */
    __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
    _mm_store_ps(dotProductVector, dotProdVal);

    dotProduct = dotProductVector[0];
    dotProduct += dotProductVector[1];
    dotProduct += dotProductVector[2];
    dotProduct += dotProductVector[3];

    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        dotProduct += ((*aPtr++) * (*bPtr++));
    }

    *result = dotProduct;
}

#endif /* LV_HAVE_SSE4_1 */
#ifdef LV_HAVE_AVX

#include <immintrin.h>

/*!
 * \brief AVX dot product using unaligned loads.
 *
 * Consumes 16 floats per iteration through two independent 8-lane
 * accumulators, adds the accumulators together, sums the eight lanes, then
 * handles the remaining (num_points % 16) samples with a scalar loop.
 *
 * \param result      receives the scalar dot product
 * \param input       first vector, num_points floats
 * \param taps        second vector, num_points floats
 * \param num_points  number of float pairs to multiply-accumulate
 *
 * NOTE(review): the LV_HAVE_AVX guard, the __m256 temporaries, the
 * dotProductVector declaration and the aPtr/bPtr advances were absent from
 * the listing this block was recovered from; they are restored from the
 * statements that use them - confirm against upstream.
 */
static inline void volk_32f_x2_dot_prod_32f_u_avx(float* result,
                                                  const float* input,
                                                  const float* taps,
                                                  unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    float dotProduct = 0;
    const float* aPtr = input;
    const float* bPtr = taps;

    __m256 a0Val, a1Val;
    __m256 b0Val, b1Val;
    __m256 c0Val, c1Val;

    __m256 dotProdVal0 = _mm256_setzero_ps();
    __m256 dotProdVal1 = _mm256_setzero_ps();

    for (; number < sixteenthPoints; number++) {

        a0Val = _mm256_loadu_ps(aPtr);
        a1Val = _mm256_loadu_ps(aPtr + 8);
        b0Val = _mm256_loadu_ps(bPtr);
        b1Val = _mm256_loadu_ps(bPtr + 8);

        c0Val = _mm256_mul_ps(a0Val, b0Val);
        c1Val = _mm256_mul_ps(a1Val, b1Val);

        dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
        dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);

        /* Keep the cursors in step with the scalar tail below. */
        aPtr += 16;
        bPtr += 16;
    }

    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);

    __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];

    _mm256_storeu_ps(dotProductVector, dotProdVal0);

    dotProduct = dotProductVector[0];
    dotProduct += dotProductVector[1];
    dotProduct += dotProductVector[2];
    dotProduct += dotProductVector[3];
    dotProduct += dotProductVector[4];
    dotProduct += dotProductVector[5];
    dotProduct += dotProductVector[6];
    dotProduct += dotProductVector[7];

    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        dotProduct += ((*aPtr++) * (*bPtr++));
    }

    *result = dotProduct;
}

#endif /* LV_HAVE_AVX */
#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

/*!
 * \brief AVX2/FMA dot product using unaligned loads.
 *
 * Consumes 8 floats per iteration with one fused multiply-add into a single
 * 8-lane accumulator, sums the eight lanes, then handles the remaining
 * (num_points % 8) samples with a scalar loop.
 *
 * \param result      receives the scalar dot product
 * \param input       first vector, num_points floats
 * \param taps        second vector, num_points floats
 * \param num_points  number of float pairs to multiply-accumulate
 *
 * NOTE(review): the declarations of number, aVal1/bVal1 and
 * dotProductVector, and the aPtr/bPtr advances, were absent from the listing
 * this block was recovered from; they are restored from the statements that
 * use them - confirm against upstream.
 */
static inline void volk_32f_x2_dot_prod_32f_u_avx2_fma(float* result,
                                                       const float* input,
                                                       const float* taps,
                                                       unsigned int num_points)
{
    unsigned int number;
    const unsigned int eighthPoints = num_points / 8;

    const float* aPtr = input;
    const float* bPtr = taps;

    __m256 dotProdVal = _mm256_setzero_ps();
    __m256 aVal1, bVal1;

    for (number = 0; number < eighthPoints; number++) {

        aVal1 = _mm256_loadu_ps(aPtr);
        bVal1 = _mm256_loadu_ps(bPtr);
        aPtr += 8;
        bPtr += 8;

        dotProdVal = _mm256_fmadd_ps(aVal1, bVal1, dotProdVal);
    }

    __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
    _mm256_storeu_ps(dotProductVector, dotProdVal);

    float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
                       dotProductVector[3] + dotProductVector[4] + dotProductVector[5] +
                       dotProductVector[6] + dotProductVector[7];

    for (number = eighthPoints * 8; number < num_points; number++) {
        dotProduct += ((*aPtr++) * (*bPtr++));
    }

    *result = dotProduct;
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
434 #include <immintrin.h> 435 static inline void volk_32f_x2_dot_prod_32f_u_avx512f(
float* result,
438 unsigned int num_points)
441 const unsigned int sixteenthPoints = num_points / 16;
443 const float* aPtr = input;
444 const float* bPtr = taps;
446 __m512 dotProdVal = _mm512_setzero_ps();
449 for (number = 0; number < sixteenthPoints; number++) {
451 aVal1 = _mm512_loadu_ps(aPtr);
452 bVal1 = _mm512_loadu_ps(bPtr);
456 dotProdVal = _mm512_fmadd_ps(aVal1, bVal1, dotProdVal);
460 _mm512_storeu_ps(dotProductVector,
463 float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
464 dotProductVector[3] + dotProductVector[4] + dotProductVector[5] +
465 dotProductVector[6] + dotProductVector[7] + dotProductVector[8] +
466 dotProductVector[9] + dotProductVector[10] + dotProductVector[11] +
467 dotProductVector[12] + dotProductVector[13] +
468 dotProductVector[14] + dotProductVector[15];
470 for (number = sixteenthPoints * 16; number < num_points; number++) {
471 dotProduct += ((*aPtr++) * (*bPtr++));
474 *result = dotProduct;
480 #ifndef INCLUDED_volk_32f_x2_dot_prod_32f_a_H 481 #define INCLUDED_volk_32f_x2_dot_prod_32f_a_H 487 #ifdef LV_HAVE_GENERIC 493 unsigned int num_points)
496 float dotProduct = 0;
497 const float* aPtr = input;
498 const float* bPtr = taps;
499 unsigned int number = 0;
501 for (number = 0; number < num_points; number++) {
502 dotProduct += ((*aPtr++) * (*bPtr++));
505 *result = dotProduct;
#ifdef LV_HAVE_SSE

#include <xmmintrin.h>

/*!
 * \brief SSE dot product using aligned loads.
 *
 * Consumes 16 floats per loop iteration through four independent 4-lane
 * accumulators, folds the accumulators into one vector, sums its four lanes,
 * then handles the remaining (num_points % 16) samples with a scalar loop.
 * The loads are _mm_load_ps, so input and taps must be 16-byte aligned.
 *
 * \param result      receives the scalar dot product
 * \param input       first vector, num_points floats, 16-byte aligned
 * \param taps        second vector, num_points floats, 16-byte aligned
 * \param num_points  number of float pairs to multiply-accumulate
 *
 * NOTE(review): the LV_HAVE_SSE guard, the dotProductVector declaration and
 * the aPtr/bPtr advances were absent from the listing this block was
 * recovered from; they are restored from the loop structure and the sibling
 * kernels - confirm against upstream.
 */
static inline void volk_32f_x2_dot_prod_32f_a_sse(float* result,
                                                  const float* input,
                                                  const float* taps,
                                                  unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    float dotProduct = 0;
    const float* aPtr = input;
    const float* bPtr = taps;

    __m128 a0Val, a1Val, a2Val, a3Val;
    __m128 b0Val, b1Val, b2Val, b3Val;
    __m128 c0Val, c1Val, c2Val, c3Val;

    __m128 dotProdVal0 = _mm_setzero_ps();
    __m128 dotProdVal1 = _mm_setzero_ps();
    __m128 dotProdVal2 = _mm_setzero_ps();
    __m128 dotProdVal3 = _mm_setzero_ps();

    for (; number < sixteenthPoints; number++) {

        a0Val = _mm_load_ps(aPtr);
        a1Val = _mm_load_ps(aPtr + 4);
        a2Val = _mm_load_ps(aPtr + 8);
        a3Val = _mm_load_ps(aPtr + 12);
        b0Val = _mm_load_ps(bPtr);
        b1Val = _mm_load_ps(bPtr + 4);
        b2Val = _mm_load_ps(bPtr + 8);
        b3Val = _mm_load_ps(bPtr + 12);

        c0Val = _mm_mul_ps(a0Val, b0Val);
        c1Val = _mm_mul_ps(a1Val, b1Val);
        c2Val = _mm_mul_ps(a2Val, b2Val);
        c3Val = _mm_mul_ps(a3Val, b3Val);

        dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
        dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
        dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
        dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);

        /* Keep the cursors in step with the scalar tail below. */
        aPtr += 16;
        bPtr += 16;
    }

    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);

    /* _mm_store_ps is an aligned store, hence the 16-byte aligned scratch. */
    __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];

    _mm_store_ps(dotProductVector, dotProdVal0);

    dotProduct = dotProductVector[0];
    dotProduct += dotProductVector[1];
    dotProduct += dotProductVector[2];
    dotProduct += dotProductVector[3];

    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        dotProduct += ((*aPtr++) * (*bPtr++));
    }

    *result = dotProduct;
}

#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_SSE3

#include <pmmintrin.h>

/*!
 * \brief SSE3-gated dot product using aligned loads.
 *
 * Same shape as the aligned SSE kernel: 16 floats per iteration spread over
 * four independent 4-lane accumulators, a fold of the accumulators, a lane
 * sum, and a scalar loop for the (num_points % 16) leftover samples.
 * The loads are _mm_load_ps, so input and taps must be 16-byte aligned.
 *
 * \param result      receives the scalar dot product
 * \param input       first vector, num_points floats, 16-byte aligned
 * \param taps        second vector, num_points floats, 16-byte aligned
 * \param num_points  number of float pairs to multiply-accumulate
 *
 * NOTE(review): the LV_HAVE_SSE3 guard, the dotProductVector declaration and
 * the aPtr/bPtr advances were absent from the listing this block was
 * recovered from; they are restored from the loop structure and the sibling
 * kernels - confirm against upstream.
 */
static inline void volk_32f_x2_dot_prod_32f_a_sse3(float* result,
                                                   const float* input,
                                                   const float* taps,
                                                   unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    float dotProduct = 0;
    const float* aPtr = input;
    const float* bPtr = taps;

    __m128 a0Val, a1Val, a2Val, a3Val;
    __m128 b0Val, b1Val, b2Val, b3Val;
    __m128 c0Val, c1Val, c2Val, c3Val;

    __m128 dotProdVal0 = _mm_setzero_ps();
    __m128 dotProdVal1 = _mm_setzero_ps();
    __m128 dotProdVal2 = _mm_setzero_ps();
    __m128 dotProdVal3 = _mm_setzero_ps();

    for (; number < sixteenthPoints; number++) {

        a0Val = _mm_load_ps(aPtr);
        a1Val = _mm_load_ps(aPtr + 4);
        a2Val = _mm_load_ps(aPtr + 8);
        a3Val = _mm_load_ps(aPtr + 12);
        b0Val = _mm_load_ps(bPtr);
        b1Val = _mm_load_ps(bPtr + 4);
        b2Val = _mm_load_ps(bPtr + 8);
        b3Val = _mm_load_ps(bPtr + 12);

        c0Val = _mm_mul_ps(a0Val, b0Val);
        c1Val = _mm_mul_ps(a1Val, b1Val);
        c2Val = _mm_mul_ps(a2Val, b2Val);
        c3Val = _mm_mul_ps(a3Val, b3Val);

        dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
        dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
        dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
        dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);

        /* Keep the cursors in step with the scalar tail below. */
        aPtr += 16;
        bPtr += 16;
    }

    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);

    /* _mm_store_ps is an aligned store, hence the 16-byte aligned scratch. */
    __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
    _mm_store_ps(dotProductVector, dotProdVal0);

    dotProduct = dotProductVector[0];
    dotProduct += dotProductVector[1];
    dotProduct += dotProductVector[2];
    dotProduct += dotProductVector[3];

    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        dotProduct += ((*aPtr++) * (*bPtr++));
    }

    *result = dotProduct;
}

#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_SSE4_1

#include <smmintrin.h>

/*!
 * \brief SSE4.1 dot product using aligned loads and _mm_dp_ps.
 *
 * Each iteration loads four 4-float groups from each vector and reduces each
 * group with _mm_dp_ps. The four immediates (0xF1, 0xF2, 0xF4, 0xF8) differ
 * only in their low nibble, so each partial sum lands in a different lane;
 * the four results are OR-ed into one vector and added to the running
 * accumulator. Leftover (num_points % 16) samples go through a scalar loop.
 * The loads are _mm_load_ps, so input and taps must be 16-byte aligned.
 *
 * \param result      receives the scalar dot product
 * \param input       first vector, num_points floats, 16-byte aligned
 * \param taps        second vector, num_points floats, 16-byte aligned
 * \param num_points  number of float pairs to multiply-accumulate
 *
 * NOTE(review): the pointer advances between consecutive loads and the
 * dotProductVector declaration were absent from the listing this block was
 * recovered from; they are restored from the loop structure - confirm
 * against upstream.
 */
static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float* result,
                                                     const float* input,
                                                     const float* taps,
                                                     unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    float dotProduct = 0;
    const float* aPtr = input;
    const float* bPtr = taps;

    __m128 aVal1, bVal1, cVal1;
    __m128 aVal2, bVal2, cVal2;
    __m128 aVal3, bVal3, cVal3;
    __m128 aVal4, bVal4, cVal4;

    __m128 dotProdVal = _mm_setzero_ps();

    for (; number < sixteenthPoints; number++) {

        aVal1 = _mm_load_ps(aPtr);
        aPtr += 4;
        aVal2 = _mm_load_ps(aPtr);
        aPtr += 4;
        aVal3 = _mm_load_ps(aPtr);
        aPtr += 4;
        aVal4 = _mm_load_ps(aPtr);
        aPtr += 4;

        bVal1 = _mm_load_ps(bPtr);
        bPtr += 4;
        bVal2 = _mm_load_ps(bPtr);
        bPtr += 4;
        bVal3 = _mm_load_ps(bPtr);
        bPtr += 4;
        bVal4 = _mm_load_ps(bPtr);
        bPtr += 4;

        /* High nibble 0xF: use all four products. Low nibble: output lane. */
        cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
        cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
        cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
        cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);

        /* Lanes are disjoint, so OR merges the four partial sums. */
        cVal1 = _mm_or_ps(cVal1, cVal2);
        cVal3 = _mm_or_ps(cVal3, cVal4);
        cVal1 = _mm_or_ps(cVal1, cVal3);

        dotProdVal = _mm_add_ps(dotProdVal, cVal1);
    }

    /* _mm_store_ps is an aligned store, hence the 16-byte aligned scratch. */
    __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
    _mm_store_ps(dotProductVector, dotProdVal);

    dotProduct = dotProductVector[0];
    dotProduct += dotProductVector[1];
    dotProduct += dotProductVector[2];
    dotProduct += dotProductVector[3];

    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        dotProduct += ((*aPtr++) * (*bPtr++));
    }

    *result = dotProduct;
}

#endif /* LV_HAVE_SSE4_1 */
#ifdef LV_HAVE_AVX

#include <immintrin.h>

/*!
 * \brief AVX dot product using aligned loads.
 *
 * Consumes 16 floats per iteration through two independent 8-lane
 * accumulators, adds the accumulators together, sums the eight lanes, then
 * handles the remaining (num_points % 16) samples with a scalar loop.
 * The loads are _mm256_load_ps, so input and taps must be 32-byte aligned.
 *
 * \param result      receives the scalar dot product
 * \param input       first vector, num_points floats, 32-byte aligned
 * \param taps        second vector, num_points floats, 32-byte aligned
 * \param num_points  number of float pairs to multiply-accumulate
 *
 * NOTE(review): the LV_HAVE_AVX guard, the __m256 temporaries, the
 * dotProductVector declaration and the aPtr/bPtr advances were absent from
 * the listing this block was recovered from; they are restored from the
 * statements that use them - confirm against upstream.
 */
static inline void volk_32f_x2_dot_prod_32f_a_avx(float* result,
                                                  const float* input,
                                                  const float* taps,
                                                  unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    float dotProduct = 0;
    const float* aPtr = input;
    const float* bPtr = taps;

    __m256 a0Val, a1Val;
    __m256 b0Val, b1Val;
    __m256 c0Val, c1Val;

    __m256 dotProdVal0 = _mm256_setzero_ps();
    __m256 dotProdVal1 = _mm256_setzero_ps();

    for (; number < sixteenthPoints; number++) {

        a0Val = _mm256_load_ps(aPtr);
        a1Val = _mm256_load_ps(aPtr + 8);
        b0Val = _mm256_load_ps(bPtr);
        b1Val = _mm256_load_ps(bPtr + 8);

        c0Val = _mm256_mul_ps(a0Val, b0Val);
        c1Val = _mm256_mul_ps(a1Val, b1Val);

        dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
        dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);

        /* Keep the cursors in step with the scalar tail below. */
        aPtr += 16;
        bPtr += 16;
    }

    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);

    /* _mm256_store_ps is an aligned store, hence the 32-byte aligned scratch. */
    __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];

    _mm256_store_ps(dotProductVector, dotProdVal0);

    dotProduct = dotProductVector[0];
    dotProduct += dotProductVector[1];
    dotProduct += dotProductVector[2];
    dotProduct += dotProductVector[3];
    dotProduct += dotProductVector[4];
    dotProduct += dotProductVector[5];
    dotProduct += dotProductVector[6];
    dotProduct += dotProductVector[7];

    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        dotProduct += ((*aPtr++) * (*bPtr++));
    }

    *result = dotProduct;
}

#endif /* LV_HAVE_AVX */
#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

/*!
 * \brief AVX2/FMA dot product using aligned loads.
 *
 * Consumes 8 floats per iteration with one fused multiply-add into a single
 * 8-lane accumulator, sums the eight lanes, then handles the remaining
 * (num_points % 8) samples with a scalar loop.
 * The loads are _mm256_load_ps, so input and taps must be 32-byte aligned.
 *
 * \param result      receives the scalar dot product
 * \param input       first vector, num_points floats, 32-byte aligned
 * \param taps        second vector, num_points floats, 32-byte aligned
 * \param num_points  number of float pairs to multiply-accumulate
 *
 * NOTE(review): the declarations of number, aVal1/bVal1 and
 * dotProductVector, and the aPtr/bPtr advances, were absent from the listing
 * this block was recovered from; they are restored from the statements that
 * use them - confirm against upstream.
 */
static inline void volk_32f_x2_dot_prod_32f_a_avx2_fma(float* result,
                                                       const float* input,
                                                       const float* taps,
                                                       unsigned int num_points)
{
    unsigned int number;
    const unsigned int eighthPoints = num_points / 8;

    const float* aPtr = input;
    const float* bPtr = taps;

    __m256 dotProdVal = _mm256_setzero_ps();
    __m256 aVal1, bVal1;

    for (number = 0; number < eighthPoints; number++) {

        aVal1 = _mm256_load_ps(aPtr);
        bVal1 = _mm256_load_ps(bPtr);
        aPtr += 8;
        bPtr += 8;

        dotProdVal = _mm256_fmadd_ps(aVal1, bVal1, dotProdVal);
    }

    /* _mm256_store_ps is an aligned store, hence the 32-byte aligned scratch. */
    __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
    _mm256_store_ps(dotProductVector, dotProdVal);

    float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
                       dotProductVector[3] + dotProductVector[4] + dotProductVector[5] +
                       dotProductVector[6] + dotProductVector[7];

    for (number = eighthPoints * 8; number < num_points; number++) {
        dotProduct += ((*aPtr++) * (*bPtr++));
    }

    *result = dotProduct;
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
#ifdef LV_HAVE_AVX512F
#include <immintrin.h>

/*!
 * \brief AVX-512F dot product using aligned loads.
 *
 * Consumes 16 floats per iteration with one fused multiply-add into a single
 * 16-lane accumulator, sums the sixteen lanes, then handles the remaining
 * (num_points % 16) samples with a scalar loop.
 * The loads are _mm512_load_ps, so input and taps must be 64-byte aligned.
 *
 * \param result      receives the scalar dot product
 * \param input       first vector, num_points floats, 64-byte aligned
 * \param taps        second vector, num_points floats, 64-byte aligned
 * \param num_points  number of float pairs to multiply-accumulate
 *
 * NOTE(review): the LV_HAVE_AVX512F guard, the declarations of number,
 * aVal1/bVal1 and dotProductVector, and the aPtr/bPtr advances, were absent
 * from the listing this block was recovered from; they are restored from the
 * statements that use them - confirm against upstream.
 */
static inline void volk_32f_x2_dot_prod_32f_a_avx512f(float* result,
                                                      const float* input,
                                                      const float* taps,
                                                      unsigned int num_points)
{
    unsigned int number;
    const unsigned int sixteenthPoints = num_points / 16;

    const float* aPtr = input;
    const float* bPtr = taps;

    __m512 dotProdVal = _mm512_setzero_ps();
    __m512 aVal1, bVal1;

    for (number = 0; number < sixteenthPoints; number++) {

        aVal1 = _mm512_load_ps(aPtr);
        bVal1 = _mm512_load_ps(bPtr);
        aPtr += 16;
        bPtr += 16;

        dotProdVal = _mm512_fmadd_ps(aVal1, bVal1, dotProdVal);
    }

    /* _mm512_store_ps is an aligned store, hence the 64-byte aligned scratch. */
    __VOLK_ATTR_ALIGNED(64) float dotProductVector[16];
    _mm512_store_ps(dotProductVector, dotProdVal);

    float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
                       dotProductVector[3] + dotProductVector[4] + dotProductVector[5] +
                       dotProductVector[6] + dotProductVector[7] + dotProductVector[8] +
                       dotProductVector[9] + dotProductVector[10] + dotProductVector[11] +
                       dotProductVector[12] + dotProductVector[13] +
                       dotProductVector[14] + dotProductVector[15];

    for (number = sixteenthPoints * 16; number < num_points; number++) {
        dotProduct += ((*aPtr++) * (*bPtr++));
    }

    *result = dotProduct;
}

#endif /* LV_HAVE_AVX512F */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

/*!
 * \brief NEON dot product, 16 floats per iteration.
 *
 * Each iteration loads 16 floats from each vector with vld4q_f32 (a
 * de-interleaving load into four q registers) and multiply-accumulates them
 * into four independent accumulators. De-interleaving permutes the samples
 * identically for both inputs, so every product still pairs input[i] with
 * taps[i]. The accumulators are then summed, the four lanes reduced, and the
 * remaining (num_points % 16) samples handled by a scalar loop.
 *
 * \param result      receives the scalar dot product
 * \param input       first vector, num_points floats
 * \param taps        second vector, num_points floats
 * \param num_points  number of float pairs to multiply-accumulate
 *
 * NOTE(review): the LV_HAVE_NEON guard, the accumulator[] scratch
 * declaration and the aPtr/bPtr advances were absent from the listing this
 * block was recovered from; they are restored from the statements that use
 * them - confirm against upstream. The local formerly named quarter_points
 * holds num_points / 16 and is renamed accordingly.
 */
static inline void volk_32f_x2_dot_prod_32f_neonopts(float* result,
                                                     const float* input,
                                                     const float* taps,
                                                     unsigned int num_points)
{
    unsigned int sixteenth_points = num_points / 16;
    float dotProduct = 0;
    const float* aPtr = input;
    const float* bPtr = taps;
    unsigned int number = 0;

    float32x4x4_t a_val, b_val, accumulator0;
    accumulator0.val[0] = vdupq_n_f32(0);
    accumulator0.val[1] = vdupq_n_f32(0);
    accumulator0.val[2] = vdupq_n_f32(0);
    accumulator0.val[3] = vdupq_n_f32(0);

    /* Four independent accumulators, one per de-interleaved register. */
    for (number = 0; number < sixteenth_points; ++number) {
        a_val = vld4q_f32(aPtr);
        b_val = vld4q_f32(bPtr);
        accumulator0.val[0] = vmlaq_f32(accumulator0.val[0], a_val.val[0], b_val.val[0]);
        accumulator0.val[1] = vmlaq_f32(accumulator0.val[1], a_val.val[1], b_val.val[1]);
        accumulator0.val[2] = vmlaq_f32(accumulator0.val[2], a_val.val[2], b_val.val[2]);
        accumulator0.val[3] = vmlaq_f32(accumulator0.val[3], a_val.val[3], b_val.val[3]);
        aPtr += 16;
        bPtr += 16;
    }

    accumulator0.val[0] = vaddq_f32(accumulator0.val[0], accumulator0.val[1]);
    accumulator0.val[2] = vaddq_f32(accumulator0.val[2], accumulator0.val[3]);
    accumulator0.val[0] = vaddq_f32(accumulator0.val[2], accumulator0.val[0]);

    float accumulator[4];
    vst1q_f32(accumulator, accumulator0.val[0]);
    dotProduct = accumulator[0] + accumulator[1] + accumulator[2] + accumulator[3];

    for (number = sixteenth_points * 16; number < num_points; number++) {
        dotProduct += ((*aPtr++) * (*bPtr++));
    }

    *result = dotProduct;
}

#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

/*!
 * \brief NEON dot product, 8 floats per iteration.
 *
 * Each iteration loads 8 floats from each vector with vld2q_f32 (a
 * de-interleaving load into two q registers) and multiply-accumulates them
 * into two independent accumulators. De-interleaving permutes the samples
 * identically for both inputs, so every product still pairs input[i] with
 * taps[i]. The accumulators are then summed, the four lanes reduced, and the
 * remaining (num_points % 8) samples handled by a scalar loop.
 *
 * \param result      receives the scalar dot product
 * \param input       first vector, num_points floats
 * \param taps        second vector, num_points floats
 * \param num_points  number of float pairs to multiply-accumulate
 *
 * NOTE(review): the LV_HAVE_NEON guard, the accumulator[] scratch
 * declaration and the aPtr/bPtr advances were absent from the listing this
 * block was recovered from; they are restored from the statements that use
 * them - confirm against upstream. The local formerly named quarter_points
 * holds num_points / 8 and is renamed accordingly.
 */
static inline void volk_32f_x2_dot_prod_32f_neon(float* result,
                                                 const float* input,
                                                 const float* taps,
                                                 unsigned int num_points)
{
    unsigned int eighth_points = num_points / 8;
    float dotProduct = 0;
    const float* aPtr = input;
    const float* bPtr = taps;
    unsigned int number = 0;

    float32x4x2_t a_val, b_val, accumulator_val;
    accumulator_val.val[0] = vdupq_n_f32(0);
    accumulator_val.val[1] = vdupq_n_f32(0);

    /* Two independent accumulators, one per de-interleaved register. */
    for (number = 0; number < eighth_points; ++number) {
        a_val = vld2q_f32(aPtr);
        b_val = vld2q_f32(bPtr);
        accumulator_val.val[0] =
            vmlaq_f32(accumulator_val.val[0], a_val.val[0], b_val.val[0]);
        accumulator_val.val[1] =
            vmlaq_f32(accumulator_val.val[1], a_val.val[1], b_val.val[1]);
        aPtr += 8;
        bPtr += 8;
    }

    accumulator_val.val[0] = vaddq_f32(accumulator_val.val[0], accumulator_val.val[1]);

    float accumulator[4];
    vst1q_f32(accumulator, accumulator_val.val[0]);
    dotProduct = accumulator[0] + accumulator[1] + accumulator[2] + accumulator[3];

    for (number = eighth_points * 8; number < num_points; number++) {
        dotProduct += ((*aPtr++) * (*bPtr++));
    }

    *result = dotProduct;
}

#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_NEONV7
/*
 * Declaration only: the definition is not in this header and must be
 * supplied at link time.
 * NOTE(review): presumably cVector receives the scalar result and
 * aVector/bVector are the two inputs, mirroring (result, input, taps) of the
 * C kernels - confirm against the external implementation.
 */
extern void volk_32f_x2_dot_prod_32f_a_neonasm(float* cVector,
                                               const float* aVector,
                                               const float* bVector,
                                               unsigned int num_points);
#endif /* LV_HAVE_NEONV7 */
984 #ifdef LV_HAVE_NEONV7 985 extern void volk_32f_x2_dot_prod_32f_a_neonasm_opts(
float* cVector,
986 const float* aVector,
987 const float* bVector,
988 unsigned int num_points);
/*
 * Cross-reference index (from the rendered documentation listing).
 * Every kernel below has the signature
 *   static void NAME(float *result, const float *input, const float *taps,
 *                    unsigned int num_points)
 *
 *   volk_32f_x2_dot_prod_32f_u_avx      Definition: volk_32f_x2_dot_prod_32f.h:327
 *   volk_32f_x2_dot_prod_32f_a_generic  Definition: volk_32f_x2_dot_prod_32f.h:490
 *   volk_32f_x2_dot_prod_32f_a_sse3     Definition: volk_32f_x2_dot_prod_32f.h:589
 *   volk_32f_x2_dot_prod_32f_a_avx      Definition: volk_32f_x2_dot_prod_32f.h:736
 *   volk_32f_x2_dot_prod_32f_u_sse      Definition: volk_32f_x2_dot_prod_32f.h:105
 *   volk_32f_x2_dot_prod_32f_u_sse3     Definition: volk_32f_x2_dot_prod_32f.h:180
 *   volk_32f_x2_dot_prod_32f_a_sse      Definition: volk_32f_x2_dot_prod_32f.h:514
 *   volk_32f_x2_dot_prod_32f_neon       Definition: volk_32f_x2_dot_prod_32f.h:937
 *   volk_32f_x2_dot_prod_32f_generic    Definition: volk_32f_x2_dot_prod_32f.h:81
 *   volk_32f_x2_dot_prod_32f_neonopts   Definition: volk_32f_x2_dot_prod_32f.h:890
 *
 * Macro:
 *   #define __VOLK_ATTR_ALIGNED(x)      Definition: volk_common.h:56
 */