48 #ifndef INCLUDED_volk_16ic_x2_dot_prod_16ic_H 49 #define INCLUDED_volk_16ic_x2_dot_prod_16ic_H 56 #ifdef LV_HAVE_GENERIC 61 unsigned int num_points)
63 result[0] =
lv_cmake((int16_t)0, (int16_t)0);
65 for (n = 0; n < num_points; n++) {
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

/*!
 * \brief SSE2 implementation (16-byte aligned pointers required).
 *
 * Processes 4 complex samples per iteration with saturated lane-wise
 * accumulation, then reduces the 4 partial sums with sat_adds16i() and
 * handles the (num_points % 4) tail in scalar code.
 */
static inline void volk_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out,
                                                     const lv_16sc_t* in_a,
                                                     const lv_16sc_t* in_b,
                                                     unsigned int num_points)
{
    lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);

    const unsigned int sse_iters = num_points / 4;
    unsigned int number;

    const lv_16sc_t* _in_a = in_a;
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;

    if (sse_iters > 0) {
        __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl,
            a_sl, realcacc, imagcacc;
        __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4];

        realcacc = _mm_setzero_si128();
        imagcacc = _mm_setzero_si128();

        // Byte masks selecting the imaginary (odd) / real (even) 16-bit lane
        // of each complex element.
        mask_imag = _mm_set_epi8(
            0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
        mask_real = _mm_set_epi8(
            0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);

        for (number = 0; number < sse_iters; number++) {
            // a = [a3.i, a3.r, a2.i, a2.r, a1.i, a1.r, a0.i, a0.r]
            a = _mm_load_si128((__m128i*)_in_a);
            __VOLK_PREFETCH(_in_a + 8);
            b = _mm_load_si128((__m128i*)_in_b);
            __VOLK_PREFETCH(_in_b + 8);
            c = _mm_mullo_epi16(a, b); // even lanes: a.r*b.r, odd lanes: a.i*b.i

            c_sr = _mm_srli_si128(c, 2); // align a.i*b.i with a.r*b.r

            real = _mm_subs_epi16(c, c_sr); // re = a.r*b.r - a.i*b.i (saturated)

            b_sl = _mm_slli_si128(b, 2);
            a_sl = _mm_slli_si128(a, 2);

            imag1 = _mm_mullo_epi16(a, b_sl); // odd lanes: a.i*b.r
            imag2 = _mm_mullo_epi16(b, a_sl); // odd lanes: b.i*a.r

            imag = _mm_adds_epi16(imag1, imag2); // im = a.r*b.i + a.i*b.r (saturated)

            realcacc = _mm_adds_epi16(realcacc, real);
            imagcacc = _mm_adds_epi16(imagcacc, imag);

            _in_a += 4;
            _in_b += 4;
        }

        realcacc = _mm_and_si128(realcacc, mask_real);
        imagcacc = _mm_and_si128(imagcacc, mask_imag);

        a = _mm_or_si128(realcacc, imagcacc);

        _mm_store_si128((__m128i*)dotProductVector, a);

        // Horizontal reduction of the 4 partial complex sums, saturated.
        for (number = 0; number < 4; ++number) {
            dotProduct = lv_cmake(
                sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])),
                sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
        }
    }

    // Scalar tail for the remaining (num_points % 4) samples.
    for (number = 0; number < (num_points % 4); ++number) {
        lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
        dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)),
                              sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
    }

    *_out = dotProduct;
}

#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

/*!
 * \brief SSE2 implementation (unaligned pointers allowed).
 *
 * Identical algorithm to the aligned variant, but uses unaligned
 * load/store intrinsics so the inputs need no particular alignment.
 */
static inline void volk_16ic_x2_dot_prod_16ic_u_sse2(lv_16sc_t* out,
                                                     const lv_16sc_t* in_a,
                                                     const lv_16sc_t* in_b,
                                                     unsigned int num_points)
{
    lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);

    const unsigned int sse_iters = num_points / 4;
    unsigned int number;

    const lv_16sc_t* _in_a = in_a;
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;

    if (sse_iters > 0) {
        __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl,
            a_sl, realcacc, imagcacc, result;
        __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4];

        realcacc = _mm_setzero_si128();
        imagcacc = _mm_setzero_si128();

        // Byte masks selecting the imaginary (odd) / real (even) 16-bit lane
        // of each complex element.
        mask_imag = _mm_set_epi8(
            0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
        mask_real = _mm_set_epi8(
            0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);

        for (number = 0; number < sse_iters; number++) {
            a = _mm_loadu_si128((__m128i*)_in_a);
            __VOLK_PREFETCH(_in_a + 8);
            b = _mm_loadu_si128((__m128i*)_in_b);
            __VOLK_PREFETCH(_in_b + 8);
            c = _mm_mullo_epi16(a, b); // even lanes: a.r*b.r, odd lanes: a.i*b.i

            c_sr = _mm_srli_si128(c, 2); // align a.i*b.i with a.r*b.r

            real = _mm_subs_epi16(c, c_sr); // re = a.r*b.r - a.i*b.i (saturated)

            b_sl = _mm_slli_si128(b, 2);
            a_sl = _mm_slli_si128(a, 2);

            imag1 = _mm_mullo_epi16(a, b_sl); // odd lanes: a.i*b.r
            imag2 = _mm_mullo_epi16(b, a_sl); // odd lanes: b.i*a.r

            imag = _mm_adds_epi16(imag1, imag2); // im = a.r*b.i + a.i*b.r (saturated)

            realcacc = _mm_adds_epi16(realcacc, real);
            imagcacc = _mm_adds_epi16(imagcacc, imag);

            _in_a += 4;
            _in_b += 4;
        }

        realcacc = _mm_and_si128(realcacc, mask_real);
        imagcacc = _mm_and_si128(imagcacc, mask_imag);

        result = _mm_or_si128(realcacc, imagcacc);

        _mm_storeu_si128((__m128i*)dotProductVector, result);

        // Horizontal reduction of the 4 partial complex sums, saturated.
        for (number = 0; number < 4; ++number) {
            dotProduct = lv_cmake(
                sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])),
                sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
        }
    }

    // Scalar tail for the remaining (num_points % 4) samples.
    for (number = 0; number < (num_points % 4); ++number) {
        lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
        dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)),
                              sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
    }

    *_out = dotProduct;
}

#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_AVX2
#include <immintrin.h>

/*!
 * \brief AVX2 implementation (unaligned pointers allowed).
 *
 * Processes 8 complex samples per iteration. Same lane layout and saturated
 * accumulation scheme as the SSE2 variants, widened to 256-bit registers.
 *
 * NOTE(review): the name keeps the historical "axv2" spelling (a typo for
 * "avx2") because callers and the dispatcher reference it by this name.
 */
static inline void volk_16ic_x2_dot_prod_16ic_u_axv2(lv_16sc_t* out,
                                                     const lv_16sc_t* in_a,
                                                     const lv_16sc_t* in_b,
                                                     unsigned int num_points)
{
    lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);

    const unsigned int avx_iters = num_points / 8;
    unsigned int number;

    const lv_16sc_t* _in_a = in_a;
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;

    if (avx_iters > 0) {
        __m256i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl,
            a_sl, realcacc, imagcacc, result;
        __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8];

        realcacc = _mm256_setzero_si256();
        imagcacc = _mm256_setzero_si256();

        // Byte masks selecting the imaginary (odd) / real (even) 16-bit lane
        // of each complex element, repeated across the full 256-bit register.
        mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
                                    0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
                                    0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
                                    0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
        mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF,
                                    0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF,
                                    0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF,
                                    0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);

        for (number = 0; number < avx_iters; number++) {
            a = _mm256_loadu_si256((__m256i*)_in_a);
            __VOLK_PREFETCH(_in_a + 16);
            b = _mm256_loadu_si256((__m256i*)_in_b);
            __VOLK_PREFETCH(_in_b + 16);
            c = _mm256_mullo_epi16(a, b); // even lanes: a.r*b.r, odd: a.i*b.i

            c_sr = _mm256_srli_si256(c, 2); // align a.i*b.i with a.r*b.r

            real = _mm256_subs_epi16(c, c_sr); // re = a.r*b.r - a.i*b.i (saturated)

            b_sl = _mm256_slli_si256(b, 2);
            a_sl = _mm256_slli_si256(a, 2);

            imag1 = _mm256_mullo_epi16(a, b_sl); // odd lanes: a.i*b.r
            imag2 = _mm256_mullo_epi16(b, a_sl); // odd lanes: b.i*a.r

            imag = _mm256_adds_epi16(imag1, imag2); // im (saturated)

            realcacc = _mm256_adds_epi16(realcacc, real);
            imagcacc = _mm256_adds_epi16(imagcacc, imag);

            _in_a += 8;
            _in_b += 8;
        }

        realcacc = _mm256_and_si256(realcacc, mask_real);
        imagcacc = _mm256_and_si256(imagcacc, mask_imag);

        result = _mm256_or_si256(realcacc, imagcacc);

        _mm256_storeu_si256((__m256i*)dotProductVector, result);

        // Horizontal reduction of the 8 partial complex sums, saturated.
        for (number = 0; number < 8; ++number) {
            dotProduct = lv_cmake(
                sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])),
                sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
        }
        _mm256_zeroupper();
    }

    // Scalar tail for the remaining (num_points % 8) samples.
    for (number = 0; number < (num_points % 8); ++number) {
        lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
        dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)),
                              sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
    }

    *_out = dotProduct;
}

#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_AVX2
#include <immintrin.h>

/*!
 * \brief AVX2 implementation (32-byte aligned pointers required).
 *
 * Identical algorithm to the unaligned AVX2 variant, using aligned
 * load/store intrinsics.
 *
 * NOTE(review): the name keeps the historical "axv2" spelling (a typo for
 * "avx2") because callers and the dispatcher reference it by this name.
 */
static inline void volk_16ic_x2_dot_prod_16ic_a_axv2(lv_16sc_t* out,
                                                     const lv_16sc_t* in_a,
                                                     const lv_16sc_t* in_b,
                                                     unsigned int num_points)
{
    lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);

    const unsigned int avx_iters = num_points / 8;
    unsigned int number;

    const lv_16sc_t* _in_a = in_a;
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;

    if (avx_iters > 0) {
        __m256i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl,
            a_sl, realcacc, imagcacc, result;
        __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8];

        realcacc = _mm256_setzero_si256();
        imagcacc = _mm256_setzero_si256();

        // Byte masks selecting the imaginary (odd) / real (even) 16-bit lane
        // of each complex element, repeated across the full 256-bit register.
        mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
                                    0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
                                    0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
                                    0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
        mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF,
                                    0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF,
                                    0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF,
                                    0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);

        for (number = 0; number < avx_iters; number++) {
            a = _mm256_load_si256((__m256i*)_in_a);
            __VOLK_PREFETCH(_in_a + 16);
            b = _mm256_load_si256((__m256i*)_in_b);
            __VOLK_PREFETCH(_in_b + 16);
            c = _mm256_mullo_epi16(a, b); // even lanes: a.r*b.r, odd: a.i*b.i

            c_sr = _mm256_srli_si256(c, 2); // align a.i*b.i with a.r*b.r

            real = _mm256_subs_epi16(c, c_sr); // re = a.r*b.r - a.i*b.i (saturated)

            b_sl = _mm256_slli_si256(b, 2);
            a_sl = _mm256_slli_si256(a, 2);

            imag1 = _mm256_mullo_epi16(a, b_sl); // odd lanes: a.i*b.r
            imag2 = _mm256_mullo_epi16(b, a_sl); // odd lanes: b.i*a.r

            imag = _mm256_adds_epi16(imag1, imag2); // im (saturated)

            realcacc = _mm256_adds_epi16(realcacc, real);
            imagcacc = _mm256_adds_epi16(imagcacc, imag);

            _in_a += 8;
            _in_b += 8;
        }

        realcacc = _mm256_and_si256(realcacc, mask_real);
        imagcacc = _mm256_and_si256(imagcacc, mask_imag);

        result = _mm256_or_si256(realcacc, imagcacc);

        _mm256_store_si256((__m256i*)dotProductVector, result);

        // Horizontal reduction of the 8 partial complex sums, saturated.
        for (number = 0; number < 8; ++number) {
            dotProduct = lv_cmake(
                sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])),
                sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
        }
        _mm256_zeroupper();
    }

    // Scalar tail for the remaining (num_points % 8) samples.
    for (number = 0; number < (num_points % 8); ++number) {
        lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
        dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)),
                              sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
    }

    *_out = dotProduct;
}

#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

/*!
 * \brief NEON implementation using de-interleaved loads.
 *
 * vld2_s16 splits each group of 4 complex samples into a real lane vector
 * (val[0]) and an imaginary lane vector (val[1]). Partial sums accumulate
 * with saturating adds; the 4 partials are then reduced with sat_adds16i().
 * The scalar tail accumulates without saturation (native complex add).
 */
static inline void volk_16ic_x2_dot_prod_16ic_neon(lv_16sc_t* out,
                                                   const lv_16sc_t* in_a,
                                                   const lv_16sc_t* in_b,
                                                   unsigned int num_points)
{
    unsigned int quarter_points = num_points / 4;
    unsigned int number;

    lv_16sc_t* a_ptr = (lv_16sc_t*)in_a;
    lv_16sc_t* b_ptr = (lv_16sc_t*)in_b;
    *out = lv_cmake((int16_t)0, (int16_t)0);

    if (quarter_points > 0) {
        // For the 2-lane structures, val[0] holds the real parts and
        // val[1] holds the imaginary parts.
        int16x4x2_t a_val, b_val, c_val, accumulator;
        int16x4x2_t tmp_real, tmp_imag;
        __VOLK_ATTR_ALIGNED(16) lv_16sc_t accum_result[4];
        lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);

        accumulator.val[0] = vdup_n_s16(0);
        accumulator.val[1] = vdup_n_s16(0);

        for (number = 0; number < quarter_points; ++number) {
            a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
            b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
            __VOLK_PREFETCH(a_ptr + 8);
            __VOLK_PREFETCH(b_ptr + 8);

            // real*real and imag*imag products for the real part
            tmp_real.val[0] = vmul_s16(a_val.val[0], b_val.val[0]); // a.r*b.r
            tmp_real.val[1] = vmul_s16(a_val.val[1], b_val.val[1]); // a.i*b.i

            // cross products for the imaginary part
            tmp_imag.val[0] = vmul_s16(a_val.val[0], b_val.val[1]); // a.r*b.i
            tmp_imag.val[1] = vmul_s16(a_val.val[1], b_val.val[0]); // a.i*b.r

            c_val.val[0] = vqsub_s16(tmp_real.val[0], tmp_real.val[1]); // re (saturated)
            c_val.val[1] = vqadd_s16(tmp_imag.val[0], tmp_imag.val[1]); // im (saturated)

            accumulator.val[0] = vqadd_s16(accumulator.val[0], c_val.val[0]);
            accumulator.val[1] = vqadd_s16(accumulator.val[1], c_val.val[1]);

            a_ptr += 4;
            b_ptr += 4;
        }

        vst2_s16((int16_t*)accum_result, accumulator);
        // Horizontal reduction of the 4 partial complex sums, saturated.
        for (number = 0; number < 4; ++number) {
            dotProduct = lv_cmake(
                sat_adds16i(lv_creal(dotProduct), lv_creal(accum_result[number])),
                sat_adds16i(lv_cimag(dotProduct), lv_cimag(accum_result[number])));
        }

        *out = dotProduct;
    }

    // Scalar tail for the remaining samples.
    for (number = quarter_points * 4; number < num_points; ++number) {
        *out += (*a_ptr++) * (*b_ptr++);
    }
}

#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

/*!
 * \brief NEON implementation using vector multiply-accumulate (vmla/vmls).
 *
 * NOTE(review): unlike the plain NEON kernel, the per-iteration complex
 * product here is formed with non-saturating multiply-accumulate; only the
 * running accumulation (vqadd) and nothing else saturates, and the final
 * reduction is a plain sum — this matches the upstream variant's behavior.
 */
static inline void volk_16ic_x2_dot_prod_16ic_neon_vma(lv_16sc_t* out,
                                                       const lv_16sc_t* in_a,
                                                       const lv_16sc_t* in_b,
                                                       unsigned int num_points)
{
    unsigned int quarter_points = num_points / 4;
    unsigned int number;

    lv_16sc_t* a_ptr = (lv_16sc_t*)in_a;
    lv_16sc_t* b_ptr = (lv_16sc_t*)in_b;

    // val[0] holds real parts, val[1] holds imaginary parts.
    int16x4x2_t a_val, b_val, accumulator;
    int16x4x2_t tmp;
    __VOLK_ATTR_ALIGNED(16) lv_16sc_t accum_result[4];

    accumulator.val[0] = vdup_n_s16(0);
    accumulator.val[1] = vdup_n_s16(0);

    for (number = 0; number < quarter_points; ++number) {
        a_val = vld2_s16((int16_t*)a_ptr);
        b_val = vld2_s16((int16_t*)b_ptr);
        __VOLK_PREFETCH(a_ptr + 8);
        __VOLK_PREFETCH(b_ptr + 8);

        tmp.val[0] = vmul_s16(a_val.val[0], b_val.val[0]); // a.r*b.r
        tmp.val[1] = vmul_s16(a_val.val[1], b_val.val[0]); // a.i*b.r

        // multiply-accumulate/subtract completes the complex product:
        // re = a.r*b.r - a.i*b.i, im = a.i*b.r + a.r*b.i
        tmp.val[0] = vmls_s16(tmp.val[0], a_val.val[1], b_val.val[1]);
        tmp.val[1] = vmla_s16(tmp.val[1], a_val.val[0], b_val.val[1]);

        accumulator.val[0] = vqadd_s16(accumulator.val[0], tmp.val[0]);
        accumulator.val[1] = vqadd_s16(accumulator.val[1], tmp.val[1]);

        a_ptr += 4;
        b_ptr += 4;
    }

    vst2_s16((int16_t*)accum_result, accumulator);
    *out = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];

    // Scalar tail for the remaining samples.
    for (number = quarter_points * 4; number < num_points; ++number) {
        *out += (*a_ptr++) * (*b_ptr++);
    }
}

#endif /* LV_HAVE_NEON */
656 #include <arm_neon.h> 661 unsigned int num_points)
663 unsigned int quarter_points = num_points / 4;
670 int16x4x2_t a_val, b_val, accumulator1, accumulator2;
673 accumulator1.val[0] = vdup_n_s16(0);
674 accumulator1.val[1] = vdup_n_s16(0);
675 accumulator2.val[0] = vdup_n_s16(0);
676 accumulator2.val[1] = vdup_n_s16(0);
678 for (number = 0; number < quarter_points; ++number) {
679 a_val = vld2_s16((int16_t*)a_ptr);
680 b_val = vld2_s16((int16_t*)b_ptr);
685 accumulator1.val[0] = vmla_s16(accumulator1.val[0], a_val.val[0], b_val.val[0]);
686 accumulator2.val[0] = vmls_s16(accumulator2.val[0], a_val.val[1], b_val.val[1]);
687 accumulator1.val[1] = vmla_s16(accumulator1.val[1], a_val.val[0], b_val.val[1]);
688 accumulator2.val[1] = vmla_s16(accumulator2.val[1], a_val.val[1], b_val.val[0]);
694 accumulator1.val[0] = vqadd_s16(accumulator1.val[0], accumulator2.val[0]);
695 accumulator1.val[1] = vqadd_s16(accumulator1.val[1], accumulator2.val[1]);
697 vst2_s16((int16_t*)accum_result, accumulator1);
698 *out = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
701 for (number = quarter_points * 4; number < num_points; ++number) {
702 *out += (*a_ptr++) * (*b_ptr++);
static int16_t sat_adds16i(int16_t x, int16_t y)
Definition: saturation_arithmetic.h:29
short complex lv_16sc_t
Definition: volk_complex.h:67
static void volk_16ic_x2_dot_prod_16ic_u_sse2(lv_16sc_t *out, const lv_16sc_t *in_a, const lv_16sc_t *in_b, unsigned int num_points)
Definition: volk_16ic_x2_dot_prod_16ic.h:163
#define lv_cmake(r, i)
Definition: volk_complex.h:73
static void volk_16ic_x2_dot_prod_16ic_neon(lv_16sc_t *out, const lv_16sc_t *in_a, const lv_16sc_t *in_b, unsigned int num_points)
Definition: volk_16ic_x2_dot_prod_16ic.h:533
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:62
static void volk_16ic_x2_dot_prod_16ic_generic(lv_16sc_t *result, const lv_16sc_t *in_a, const lv_16sc_t *in_b, unsigned int num_points)
Definition: volk_16ic_x2_dot_prod_16ic.h:58
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:56
static void volk_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t *out, const lv_16sc_t *in_a, const lv_16sc_t *in_b, unsigned int num_points)
Definition: volk_16ic_x2_dot_prod_16ic.h:78
static void volk_16ic_x2_dot_prod_16ic_neon_optvma(lv_16sc_t *out, const lv_16sc_t *in_a, const lv_16sc_t *in_b, unsigned int num_points)
Definition: volk_16ic_x2_dot_prod_16ic.h:658
static void volk_16ic_x2_dot_prod_16ic_neon_vma(lv_16sc_t *out, const lv_16sc_t *in_a, const lv_16sc_t *in_b, unsigned int num_points)
Definition: volk_16ic_x2_dot_prod_16ic.h:605
#define lv_creal(x)
Definition: volk_complex.h:92
#define lv_cimag(x)
Definition: volk_complex.h:94