74 #ifndef INCLUDED_volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_H
75 #define INCLUDED_volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_H
#ifdef LV_HAVE_GENERIC

/*
 * Generic (portable) implementation:
 *   cVector[i] = aVector[i] + lv_conj(bVector[i]) * (*scalar)
 *
 * \param cVector    output vector (num_points complex floats)
 * \param aVector    addend input vector
 * \param bVector    input vector whose conjugate is multiplied by the scalar
 * \param scalar     pointer to the complex scalar multiplier
 * \param num_points number of complex elements to process
 */
static inline void
volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_generic(lv_32fc_t* cVector,
                                                        const lv_32fc_t* aVector,
                                                        const lv_32fc_t* bVector,
                                                        const lv_32fc_t* scalar,
                                                        unsigned int num_points)
{
    const lv_32fc_t* aPtr = aVector;
    const lv_32fc_t* bPtr = bVector;
    lv_32fc_t* cPtr = cVector;
    unsigned int number = num_points;

    // Unrolled loop: process eight elements per iteration to reduce
    // loop overhead on compilers that do not unroll automatically.
    while (number >= 8) {
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar);
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar);
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar);
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar);
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar);
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar);
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar);
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar);
        number -= 8;
    }

    // Clean up the remaining 0..7 elements.
    while (number-- > 0) {
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar);
    }
}
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>

/*
 * AVX implementation, unaligned loads/stores:
 *   cVector[i] = aVector[i] + lv_conj(bVector[i]) * (*scalar)
 * Processes 4 complex floats (8 real floats) per iteration.
 */
static inline void
volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_u_avx(lv_32fc_t* cVector,
                                                      const lv_32fc_t* aVector,
                                                      const lv_32fc_t* bVector,
                                                      const lv_32fc_t* scalar,
                                                      unsigned int num_points)
{
    unsigned int number = 0;
    unsigned int i = 0;
    const unsigned int quarterPoints = num_points / 4;
    unsigned int isodd = num_points & 3; // leftover elements after the SIMD loop

    __m256 x, y, s, z;
    lv_32fc_t v_scalar[4] = { *scalar, *scalar, *scalar, *scalar };

    const lv_32fc_t* a = aVector;
    const lv_32fc_t* b = bVector;
    lv_32fc_t* c = cVector;

    // Broadcast the complex scalar across one AVX register.
    s = _mm256_loadu_ps((float*)v_scalar);

    for (; number < quarterPoints; number++) {
        x = _mm256_loadu_ps((float*)b);
        y = _mm256_loadu_ps((float*)a);
        // z = s * conj(x)  (helper conjugates its second argument)
        z = _mm256_complexconjugatemul_ps(s, x);
        z = _mm256_add_ps(y, z);
        _mm256_storeu_ps((float*)c, z);

        a += 4;
        b += 4;
        c += 4;
    }

    // Scalar tail for the remaining num_points % 4 elements.
    for (i = num_points - isodd; i < num_points; i++) {
        *c++ = (*a++) + lv_conj(*b++) * (*scalar);
    }
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <volk/volk_sse3_intrinsics.h>

/*
 * SSE3 implementation, unaligned loads/stores:
 *   cVector[i] = aVector[i] + lv_conj(bVector[i]) * (*scalar)
 * Processes 2 complex floats (4 real floats) per iteration.
 */
static inline void
volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_u_sse3(lv_32fc_t* cVector,
                                                       const lv_32fc_t* aVector,
                                                       const lv_32fc_t* bVector,
                                                       const lv_32fc_t* scalar,
                                                       unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int halfPoints = num_points / 2;

    __m128 x, y, s, z;
    lv_32fc_t v_scalar[2] = { *scalar, *scalar };

    const lv_32fc_t* a = aVector;
    const lv_32fc_t* b = bVector;
    lv_32fc_t* c = cVector;

    // Broadcast the complex scalar across one SSE register.
    s = _mm_loadu_ps((float*)v_scalar);

    for (; number < halfPoints; number++) {
        x = _mm_loadu_ps((float*)b);
        y = _mm_loadu_ps((float*)a);
        // z = s * conj(x)  (helper conjugates its second argument)
        z = _mm_complexconjugatemul_ps(s, x);
        z = _mm_add_ps(y, z);
        _mm_storeu_ps((float*)c, z);

        a += 2;
        b += 2;
        c += 2;
    }

    // Scalar tail: one leftover element when num_points is odd.
    if ((num_points % 2) != 0) {
        *c = *a + lv_conj(*b) * (*scalar);
    }
}
#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>

/*
 * AVX implementation, aligned loads/stores (inputs/outputs must be
 * 32-byte aligned):
 *   cVector[i] = aVector[i] + lv_conj(bVector[i]) * (*scalar)
 * Processes 4 complex floats (8 real floats) per iteration.
 */
static inline void
volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_a_avx(lv_32fc_t* cVector,
                                                      const lv_32fc_t* aVector,
                                                      const lv_32fc_t* bVector,
                                                      const lv_32fc_t* scalar,
                                                      unsigned int num_points)
{
    unsigned int number = 0;
    unsigned int i = 0;
    const unsigned int quarterPoints = num_points / 4;
    unsigned int isodd = num_points & 3; // leftover elements after the SIMD loop

    __m256 x, y, s, z;
    lv_32fc_t v_scalar[4] = { *scalar, *scalar, *scalar, *scalar };

    const lv_32fc_t* a = aVector;
    const lv_32fc_t* b = bVector;
    lv_32fc_t* c = cVector;

    // Broadcast the complex scalar across one AVX register.
    // (v_scalar lives on the stack, so use an unaligned load here.)
    s = _mm256_loadu_ps((float*)v_scalar);

    for (; number < quarterPoints; number++) {
        x = _mm256_load_ps((float*)b);
        y = _mm256_load_ps((float*)a);
        // z = s * conj(x)  (helper conjugates its second argument)
        z = _mm256_complexconjugatemul_ps(s, x);
        z = _mm256_add_ps(y, z);
        _mm256_store_ps((float*)c, z);

        a += 4;
        b += 4;
        c += 4;
    }

    // Scalar tail for the remaining num_points % 4 elements.
    for (i = num_points - isodd; i < num_points; i++) {
        *c++ = (*a++) + lv_conj(*b++) * (*scalar);
    }
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <volk/volk_sse3_intrinsics.h>

/*
 * SSE3 implementation, aligned loads/stores (inputs/outputs must be
 * 16-byte aligned):
 *   cVector[i] = aVector[i] + lv_conj(bVector[i]) * (*scalar)
 * Processes 2 complex floats (4 real floats) per iteration.
 */
static inline void
volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_a_sse3(lv_32fc_t* cVector,
                                                       const lv_32fc_t* aVector,
                                                       const lv_32fc_t* bVector,
                                                       const lv_32fc_t* scalar,
                                                       unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int halfPoints = num_points / 2;

    __m128 x, y, s, z;
    lv_32fc_t v_scalar[2] = { *scalar, *scalar };

    const lv_32fc_t* a = aVector;
    const lv_32fc_t* b = bVector;
    lv_32fc_t* c = cVector;

    // Broadcast the complex scalar across one SSE register.
    // (v_scalar lives on the stack, so use an unaligned load here.)
    s = _mm_loadu_ps((float*)v_scalar);

    for (; number < halfPoints; number++) {
        x = _mm_load_ps((float*)b);
        y = _mm_load_ps((float*)a);
        // z = s * conj(x)  (helper conjugates its second argument)
        z = _mm_complexconjugatemul_ps(s, x);
        z = _mm_add_ps(y, z);
        _mm_store_ps((float*)c, z);

        a += 2;
        b += 2;
        c += 2;
    }

    // Scalar tail: one leftover element when num_points is odd.
    if ((num_points % 2) != 0) {
        *c = *a + lv_conj(*b) * (*scalar);
    }
}
#endif /* LV_HAVE_SSE3 */
295 #include <arm_neon.h>
302 unsigned int num_points)
307 unsigned int number = num_points;
308 unsigned int quarter_points = num_points / 4;
310 float32x4x2_t a_val, b_val, c_val, scalar_val;
311 float32x4x2_t tmp_val;
313 scalar_val.val[0] = vld1q_dup_f32((
const float*)scalar);
314 scalar_val.val[1] = vld1q_dup_f32(((
const float*)scalar) + 1);
316 for (number = 0; number < quarter_points; ++number) {
317 a_val = vld2q_f32((
float*)aPtr);
318 b_val = vld2q_f32((
float*)bPtr);
319 b_val.val[1] = vnegq_f32(b_val.val[1]);
323 tmp_val.val[1] = vmulq_f32(b_val.val[1], scalar_val.val[0]);
324 tmp_val.val[0] = vmulq_f32(b_val.val[0], scalar_val.val[0]);
326 tmp_val.val[1] = vmlaq_f32(tmp_val.val[1], b_val.val[0], scalar_val.val[1]);
327 tmp_val.val[0] = vmlsq_f32(tmp_val.val[0], b_val.val[1], scalar_val.val[1]);
329 c_val.val[1] = vaddq_f32(a_val.val[1], tmp_val.val[1]);
330 c_val.val[0] = vaddq_f32(a_val.val[0], tmp_val.val[0]);
332 vst2q_f32((
float*)cPtr, c_val);
339 for (number = quarter_points * 4; number < num_points; number++) {
340 *cPtr++ = (*aPtr++) +
lv_conj(*bPtr++) * (*scalar);
static void volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_generic(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, const lv_32fc_t *scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc.h:86
static void volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_neon(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, const lv_32fc_t *scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc.h:298
static void volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_u_sse3(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, const lv_32fc_t *scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc.h:168
static void volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_u_avx(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, const lv_32fc_t *scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc.h:123
static void volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_a_avx(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, const lv_32fc_t *scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc.h:211
static void volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_a_sse3(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, const lv_32fc_t *scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc.h:256
static __m256 _mm256_complexconjugatemul_ps(const __m256 x, const __m256 y)
Definition: volk_avx_intrinsics.h:76
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:68
#define lv_conj(x)
Definition: volk_complex.h:100
float complex lv_32fc_t
Definition: volk_complex.h:74
for i
Definition: volk_config_fixed.tmpl.h:13
static __m128 _mm_complexconjugatemul_ps(__m128 x, __m128 y)
Definition: volk_sse3_intrinsics.h:31