70 #ifndef INCLUDED_volk_32fc_x2_multiply_32fc_u_H 71 #define INCLUDED_volk_32fc_x2_multiply_32fc_u_H 78 #if LV_HAVE_AVX2 && LV_HAVE_FMA 79 #include <immintrin.h> 87 static inline void volk_32fc_x2_multiply_32fc_u_avx2_fma(
lv_32fc_t* cVector,
90 unsigned int num_points)
92 unsigned int number = 0;
93 const unsigned int quarterPoints = num_points / 4;
99 for (; number < quarterPoints; number++) {
102 _mm256_loadu_ps((
float*)a);
104 _mm256_loadu_ps((
float*)b);
106 const __m256 yl = _mm256_moveldup_ps(y);
107 const __m256 yh = _mm256_movehdup_ps(y);
109 const __m256 tmp2x = _mm256_permute_ps(x, 0xB1);
111 const __m256 tmp2 = _mm256_mul_ps(tmp2x, yh);
113 const __m256 z = _mm256_fmaddsub_ps(
116 _mm256_storeu_ps((
float*)c, z);
125 number = quarterPoints * 4;
126 for (; number < num_points; number++) {
127 *c++ = (*a++) * (*b++);
#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>

/*!
 * \brief Multiply two complex-float vectors element-wise: c[i] = a[i] * b[i].
 *
 * Unaligned AVX implementation: 4 complex values per iteration via the shared
 * _mm256_complexmul_ps helper, with a scalar tail for the remainder.
 *
 * \param cVector    output vector (num_points complex values)
 * \param aVector    first input vector
 * \param bVector    second input vector
 * \param num_points number of complex values to process
 */
static inline void volk_32fc_x2_multiply_32fc_u_avx(lv_32fc_t* cVector,
                                                    const lv_32fc_t* aVector,
                                                    const lv_32fc_t* bVector,
                                                    unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m256 x, y, z;
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = aVector;
    const lv_32fc_t* b = bVector;

    for (; number < quarterPoints; number++) {
        x = _mm256_loadu_ps((float*)a); /* ar0 ai0 ar1 ai1 ... */
        y = _mm256_loadu_ps((float*)b); /* br0 bi0 br1 bi1 ... */
        z = _mm256_complexmul_ps(x, y);
        _mm256_storeu_ps((float*)c, z);

        a += 4;
        b += 4;
        c += 4;
    }

    /* Scalar tail when num_points is not a multiple of 4. */
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *c++ = (*a++) * (*b++);
    }
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <volk/volk_sse3_intrinsics.h>

/*!
 * \brief Multiply two complex-float vectors element-wise: c[i] = a[i] * b[i].
 *
 * Unaligned SSE3 implementation: 2 complex values (4 floats) per iteration via
 * the shared _mm_complexmul_ps helper; at most one scalar element remains.
 *
 * \param cVector    output vector (num_points complex values)
 * \param aVector    first input vector
 * \param bVector    second input vector
 * \param num_points number of complex values to process
 */
static inline void volk_32fc_x2_multiply_32fc_u_sse3(lv_32fc_t* cVector,
                                                     const lv_32fc_t* aVector,
                                                     const lv_32fc_t* bVector,
                                                     unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int halfPoints = num_points / 2;

    __m128 x, y, z;
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = aVector;
    const lv_32fc_t* b = bVector;

    for (; number < halfPoints; number++) {
        x = _mm_loadu_ps((float*)a); /* ar0 ai0 ar1 ai1 */
        y = _mm_loadu_ps((float*)b); /* br0 bi0 br1 bi1 */
        z = _mm_complexmul_ps(x, y);
        _mm_storeu_ps((float*)c, z);

        a += 2;
        b += 2;
        c += 2;
    }

    /* Odd num_points leaves exactly one element for the scalar path. */
    if ((num_points % 2) != 0) {
        *c = (*a) * (*b);
    }
}
#endif /* LV_HAVE_SSE3 */
207 #ifdef LV_HAVE_GENERIC 212 unsigned int num_points)
217 unsigned int number = 0;
219 for (number = 0; number < num_points; number++) {
220 *cPtr++ = (*aPtr++) * (*bPtr++);
227 #ifndef INCLUDED_volk_32fc_x2_multiply_32fc_a_H 228 #define INCLUDED_volk_32fc_x2_multiply_32fc_a_H 231 #include <inttypes.h> 235 #if LV_HAVE_AVX2 && LV_HAVE_FMA 236 #include <immintrin.h> 244 static inline void volk_32fc_x2_multiply_32fc_a_avx2_fma(
lv_32fc_t* cVector,
247 unsigned int num_points)
249 unsigned int number = 0;
250 const unsigned int quarterPoints = num_points / 4;
256 for (; number < quarterPoints; number++) {
259 _mm256_load_ps((
float*)a);
261 _mm256_load_ps((
float*)b);
263 const __m256 yl = _mm256_moveldup_ps(y);
264 const __m256 yh = _mm256_movehdup_ps(y);
266 const __m256 tmp2x = _mm256_permute_ps(x, 0xB1);
268 const __m256 tmp2 = _mm256_mul_ps(tmp2x, yh);
270 const __m256 z = _mm256_fmaddsub_ps(
273 _mm256_store_ps((
float*)c, z);
282 number = quarterPoints * 4;
283 for (; number < num_points; number++) {
284 *c++ = (*a++) * (*b++);
#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>

/*!
 * \brief Multiply two complex-float vectors element-wise: c[i] = a[i] * b[i].
 *
 * Aligned AVX implementation (pointers must be 32-byte aligned): 4 complex
 * values per iteration via the shared _mm256_complexmul_ps helper, with a
 * scalar tail for the remainder.
 *
 * \param cVector    output vector (num_points complex values), 32-byte aligned
 * \param aVector    first input vector, 32-byte aligned
 * \param bVector    second input vector, 32-byte aligned
 * \param num_points number of complex values to process
 */
static inline void volk_32fc_x2_multiply_32fc_a_avx(lv_32fc_t* cVector,
                                                    const lv_32fc_t* aVector,
                                                    const lv_32fc_t* bVector,
                                                    unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m256 x, y, z;
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = aVector;
    const lv_32fc_t* b = bVector;

    for (; number < quarterPoints; number++) {
        x = _mm256_load_ps((float*)a); /* ar0 ai0 ar1 ai1 ... */
        y = _mm256_load_ps((float*)b); /* br0 bi0 br1 bi1 ... */
        z = _mm256_complexmul_ps(x, y);
        _mm256_store_ps((float*)c, z);

        a += 4;
        b += 4;
        c += 4;
    }

    /* Scalar tail when num_points is not a multiple of 4. */
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *c++ = (*a++) * (*b++);
    }
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <volk/volk_sse3_intrinsics.h>

/*!
 * \brief Multiply two complex-float vectors element-wise: c[i] = a[i] * b[i].
 *
 * Aligned SSE3 implementation (pointers must be 16-byte aligned): 2 complex
 * values (4 floats) per iteration via the shared _mm_complexmul_ps helper; at
 * most one scalar element remains.
 *
 * \param cVector    output vector (num_points complex values), 16-byte aligned
 * \param aVector    first input vector, 16-byte aligned
 * \param bVector    second input vector, 16-byte aligned
 * \param num_points number of complex values to process
 */
static inline void volk_32fc_x2_multiply_32fc_a_sse3(lv_32fc_t* cVector,
                                                     const lv_32fc_t* aVector,
                                                     const lv_32fc_t* bVector,
                                                     unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int halfPoints = num_points / 2;

    __m128 x, y, z;
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = aVector;
    const lv_32fc_t* b = bVector;

    for (; number < halfPoints; number++) {
        x = _mm_load_ps((float*)a); /* ar0 ai0 ar1 ai1 */
        y = _mm_load_ps((float*)b); /* br0 bi0 br1 bi1 */
        z = _mm_complexmul_ps(x, y);
        _mm_store_ps((float*)c, z);

        a += 2;
        b += 2;
        c += 2;
    }

    /* Odd num_points leaves exactly one element for the scalar path. */
    if ((num_points % 2) != 0) {
        *c = (*a) * (*b);
    }
}
#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_GENERIC

/*!
 * \brief Multiply two complex-float vectors element-wise: c[i] = a[i] * b[i].
 *
 * Portable scalar reference implementation (aligned dispatch path); identical
 * logic to the unaligned generic variant.
 *
 * \param cVector    output vector (num_points complex values)
 * \param aVector    first input vector
 * \param bVector    second input vector
 * \param num_points number of complex values to process
 */
static inline void volk_32fc_x2_multiply_32fc_a_generic(lv_32fc_t* cVector,
                                                        const lv_32fc_t* aVector,
                                                        const lv_32fc_t* bVector,
                                                        unsigned int num_points)
{
    lv_32fc_t* cPtr = cVector;
    const lv_32fc_t* aPtr = aVector;
    const lv_32fc_t* bPtr = bVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        *cPtr++ = (*aPtr++) * (*bPtr++);
    }
}
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

/*!
 * \brief Multiply two complex-float vectors element-wise: c[i] = a[i] * b[i].
 *
 * NEON implementation: vld2q deinterleaves 4 complex values into separate
 * real/imag lanes, the four partial products are combined into
 * (ar*br - ai*bi, ar*bi + ai*br), and vst2q reinterleaves on store.
 * A scalar tail handles the remainder.
 *
 * \param cVector    output vector (num_points complex values)
 * \param aVector    first input vector
 * \param bVector    second input vector
 * \param num_points number of complex values to process
 */
static inline void volk_32fc_x2_multiply_32fc_neon(lv_32fc_t* cVector,
                                                   const lv_32fc_t* aVector,
                                                   const lv_32fc_t* bVector,
                                                   unsigned int num_points)
{
    lv_32fc_t* a_ptr = (lv_32fc_t*)aVector;
    lv_32fc_t* b_ptr = (lv_32fc_t*)bVector;
    unsigned int quarter_points = num_points / 4;
    float32x4x2_t a_val, b_val, c_val;
    float32x4x2_t tmp_real, tmp_imag;
    unsigned int number = 0;

    for (number = 0; number < quarter_points; ++number) {
        /* val[0] holds the 4 real parts, val[1] the 4 imaginary parts. */
        a_val = vld2q_f32((float*)a_ptr);
        b_val = vld2q_f32((float*)b_ptr);
        __VOLK_PREFETCH(a_ptr + 4);
        __VOLK_PREFETCH(b_ptr + 4);

        /* Real result partials: ar*br and ai*bi */
        tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
        tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]);

        /* Imag result partials (cross terms): ar*bi and ai*br */
        tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]);
        tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);

        /* Combine: (ar*br - ai*bi) + j(ar*bi + ai*br) */
        c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
        c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);
        vst2q_f32((float*)cVector, c_val);

        a_ptr += 4;
        b_ptr += 4;
        cVector += 4;
    }

    /* Scalar tail when num_points is not a multiple of 4. */
    for (number = quarter_points * 4; number < num_points; number++) {
        *cVector++ = (*a_ptr++) * (*b_ptr++);
    }
}
#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_NEON

/*!
 * \brief Multiply two complex-float vectors element-wise: c[i] = a[i] * b[i].
 *
 * Alternative NEON implementation using fused multiply-accumulate
 * (vmlaq/vmlsq) to fold the cross terms, saving two separate multiplies per
 * iteration compared with the plain NEON variant. Kept for benchmarking.
 *
 * \param cVector    output vector (num_points complex values)
 * \param aVector    first input vector
 * \param bVector    second input vector
 * \param num_points number of complex values to process
 */
static inline void volk_32fc_x2_multiply_32fc_neon_opttests(lv_32fc_t* cVector,
                                                            const lv_32fc_t* aVector,
                                                            const lv_32fc_t* bVector,
                                                            unsigned int num_points)
{
    lv_32fc_t* a_ptr = (lv_32fc_t*)aVector;
    lv_32fc_t* b_ptr = (lv_32fc_t*)bVector;
    unsigned int quarter_points = num_points / 4;
    float32x4x2_t a_val, b_val;
    float32x4x2_t tmp_imag;
    unsigned int number = 0;

    for (number = 0; number < quarter_points; ++number) {
        /* val[0] holds the 4 real parts, val[1] the 4 imaginary parts. */
        a_val = vld2q_f32((float*)a_ptr);
        b_val = vld2q_f32((float*)b_ptr);
        __VOLK_PREFETCH(a_ptr + 4);
        __VOLK_PREFETCH(b_ptr + 4);

        /* Seed with ai*br (imag) and ar*br (real) */
        tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
        tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);

        /* Fold cross terms: imag += ar*bi, real -= ai*bi */
        tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], b_val.val[1]);
        tmp_imag.val[0] = vmlsq_f32(tmp_imag.val[0], a_val.val[1], b_val.val[1]);

        /* Store interleaved: (real, imag) */
        vst2q_f32((float*)cVector, tmp_imag);

        a_ptr += 4;
        b_ptr += 4;
        cVector += 4;
    }

    /* Scalar tail when num_points is not a multiple of 4. */
    for (number = quarter_points * 4; number < num_points; number++) {
        *cVector++ = (*a_ptr++) * (*b_ptr++);
    }
}
#endif /* LV_HAVE_NEON */
473 #ifdef LV_HAVE_NEONV7 475 extern void volk_32fc_x2_multiply_32fc_a_neonasm(
lv_32fc_t* cVector,
478 unsigned int num_points);
484 extern void volk_32fc_x2_multiply_32fc_a_orc_impl(
lv_32fc_t* cVector,
487 unsigned int num_points);
489 static inline void volk_32fc_x2_multiply_32fc_u_orc(
lv_32fc_t* cVector,
492 unsigned int num_points)
494 volk_32fc_x2_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points);
static void volk_32fc_x2_multiply_32fc_a_avx(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_multiply_32fc.h:294
static __m256 _mm256_complexmul_ps(__m256 x, __m256 y)
Definition: volk_avx_intrinsics.h:32
static void volk_32fc_x2_multiply_32fc_a_generic(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_multiply_32fc.h:363
static void volk_32fc_x2_multiply_32fc_generic(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_multiply_32fc.h:209
static __m128 _mm_complexmul_ps(__m128 x, __m128 y)
Definition: volk_sse3_intrinsics.h:32
static void volk_32fc_x2_multiply_32fc_neon(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_multiply_32fc.h:383
static void volk_32fc_x2_multiply_32fc_neon_opttests(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_multiply_32fc.h:432
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:62
static void volk_32fc_x2_multiply_32fc_u_avx(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_multiply_32fc.h:137
float complex lv_32fc_t
Definition: volk_complex.h:70
static void volk_32fc_x2_multiply_32fc_u_sse3(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_multiply_32fc.h:176
static void volk_32fc_x2_multiply_32fc_a_sse3(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_multiply_32fc.h:330