/* -*- c++ -*- */
/*
 * Copyright 2019 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

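/*!
 * \page volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc
 *
 * \b Overview
 *
 * Conjugates the complex vector bVector, multiplies it by the complex
 * scalar, adds the complex vector aVector, and stores the result in
 * cVector:
 *
 *   cVector[i] = aVector[i] + lv_conj(bVector[i]) * (*scalar)
 *
 * <b>Dispatcher Prototype</b>
 * \code
 * void volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc(lv_32fc_t* cVector,
 *                                                      const lv_32fc_t* aVector,
 *                                                      const lv_32fc_t* bVector,
 *                                                      const lv_32fc_t* scalar,
 *                                                      unsigned int num_points);
 * \endcode
 *
 * \b Inputs
 * \li aVector: The input vector to be added.
 * \li bVector: The input vector to be conjugated and scaled.
 * \li scalar: Pointer to the complex scalar multiplier.
 * \li num_points: The number of complex values in aVector and bVector.
 *
 * \b Outputs
 * \li cVector: The output vector.
 *
 * \b Example
 * A minimal usage sketch (the buffer length and scalar value below are
 * illustrative only):
 * \code
 *   #include <volk/volk.h>
 *
 *   unsigned int N = 1024;
 *   size_t alignment = volk_get_alignment();
 *   lv_32fc_t* a = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * N, alignment);
 *   lv_32fc_t* b = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * N, alignment);
 *   lv_32fc_t* c = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * N, alignment);
 *   lv_32fc_t scalar = lv_cmake(0.5f, -0.5f);
 *
 *   // ... fill a and b with data ...
 *
 *   volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc(c, a, b, &scalar, N);
 *
 *   volk_free(a);
 *   volk_free(b);
 *   volk_free(c);
 * \endcode
 */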
#ifndef INCLUDED_volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_H
#define INCLUDED_volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_H

#include <float.h>
#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_complex.h>


#ifdef LV_HAVE_GENERIC

static inline void
volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_generic(lv_32fc_t* cVector,
                                                        const lv_32fc_t* aVector,
                                                        const lv_32fc_t* bVector,
                                                        const lv_32fc_t* scalar,
                                                        unsigned int num_points)
{
    const lv_32fc_t* aPtr = aVector;
    const lv_32fc_t* bPtr = bVector;
    lv_32fc_t* cPtr = cVector;
    unsigned int number = num_points;

    // Manually unrolled loop: process eight points per iteration
    while (number >= 8) {
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar);
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar);
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar);
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar);
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar);
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar);
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar);
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar);
        number -= 8;
    }

    // Clean up any remaining points
    while (number-- > 0) {
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar);
    }
}
#endif /* LV_HAVE_GENERIC */


#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>

static inline void
volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_u_avx(lv_32fc_t* cVector,
                                                      const lv_32fc_t* aVector,
                                                      const lv_32fc_t* bVector,
                                                      const lv_32fc_t* scalar,
                                                      unsigned int num_points)
{
    unsigned int number = 0;
    unsigned int i = 0;
    const unsigned int quarterPoints = num_points / 4;
    unsigned int isodd = num_points & 3;

    __m256 x, y, s, z;
    lv_32fc_t v_scalar[4] = { *scalar, *scalar, *scalar, *scalar };

    const lv_32fc_t* a = aVector;
    const lv_32fc_t* b = bVector;
    lv_32fc_t* c = cVector;

    // Set up constant scalar vector
    s = _mm256_loadu_ps((float*)v_scalar);

    for (; number < quarterPoints; number++) {
        x = _mm256_loadu_ps((float*)b);
        y = _mm256_loadu_ps((float*)a);
        z = _mm256_complexconjugatemul_ps(s, x);
        z = _mm256_add_ps(y, z);
        _mm256_storeu_ps((float*)c, z);

        a += 4;
        b += 4;
        c += 4;
    }

    for (i = num_points - isodd; i < num_points; i++) {
        *c++ = (*a++) + lv_conj(*b++) * (*scalar);
    }
}
#endif /* LV_HAVE_AVX */


#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <volk/volk_sse3_intrinsics.h>

static inline void
volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_u_sse3(lv_32fc_t* cVector,
                                                       const lv_32fc_t* aVector,
                                                       const lv_32fc_t* bVector,
                                                       const lv_32fc_t* scalar,
                                                       unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int halfPoints = num_points / 2;

    __m128 x, y, s, z;
    lv_32fc_t v_scalar[2] = { *scalar, *scalar };

    const lv_32fc_t* a = aVector;
    const lv_32fc_t* b = bVector;
    lv_32fc_t* c = cVector;

    // Set up constant scalar vector
    s = _mm_loadu_ps((float*)v_scalar);

    for (; number < halfPoints; number++) {
        x = _mm_loadu_ps((float*)b);
        y = _mm_loadu_ps((float*)a);
        z = _mm_complexconjugatemul_ps(s, x);
        z = _mm_add_ps(y, z);
        _mm_storeu_ps((float*)c, z);

        a += 2;
        b += 2;
        c += 2;
    }

    if ((num_points % 2) != 0) {
        *c = *a + lv_conj(*b) * (*scalar);
    }
}
#endif /* LV_HAVE_SSE3 */
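
/* The *_u_* kernels above accept unaligned buffers; the *_a_* variants that
 * follow use aligned loads and stores and require buffers aligned to
 * volk_get_alignment(). */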


#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>

static inline void
volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_a_avx(lv_32fc_t* cVector,
                                                      const lv_32fc_t* aVector,
                                                      const lv_32fc_t* bVector,
                                                      const lv_32fc_t* scalar,
                                                      unsigned int num_points)
{
    unsigned int number = 0;
    unsigned int i = 0;
    const unsigned int quarterPoints = num_points / 4;
    unsigned int isodd = num_points & 3;

    __m256 x, y, s, z;
    lv_32fc_t v_scalar[4] = { *scalar, *scalar, *scalar, *scalar };

    const lv_32fc_t* a = aVector;
    const lv_32fc_t* b = bVector;
    lv_32fc_t* c = cVector;

    // Set up constant scalar vector
    s = _mm256_loadu_ps((float*)v_scalar);

    for (; number < quarterPoints; number++) {
        x = _mm256_load_ps((float*)b);
        y = _mm256_load_ps((float*)a);
        z = _mm256_complexconjugatemul_ps(s, x);
        z = _mm256_add_ps(y, z);
        _mm256_store_ps((float*)c, z);

        a += 4;
        b += 4;
        c += 4;
    }

    for (i = num_points - isodd; i < num_points; i++) {
        *c++ = (*a++) + lv_conj(*b++) * (*scalar);
    }
}
#endif /* LV_HAVE_AVX */


#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <volk/volk_sse3_intrinsics.h>

static inline void
volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_a_sse3(lv_32fc_t* cVector,
                                                       const lv_32fc_t* aVector,
                                                       const lv_32fc_t* bVector,
                                                       const lv_32fc_t* scalar,
                                                       unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int halfPoints = num_points / 2;

    __m128 x, y, s, z;
    lv_32fc_t v_scalar[2] = { *scalar, *scalar };

    const lv_32fc_t* a = aVector;
    const lv_32fc_t* b = bVector;
    lv_32fc_t* c = cVector;

    // Set up constant scalar vector
    s = _mm_loadu_ps((float*)v_scalar);

    for (; number < halfPoints; number++) {
        x = _mm_load_ps((float*)b);
        y = _mm_load_ps((float*)a);
        z = _mm_complexconjugatemul_ps(s, x);
        z = _mm_add_ps(y, z);
        _mm_store_ps((float*)c, z);

        a += 2;
        b += 2;
        c += 2;
    }

    if ((num_points % 2) != 0) {
        *c = *a + lv_conj(*b) * (*scalar);
    }
}
#endif /* LV_HAVE_SSE3 */


#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void
volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_neon(lv_32fc_t* cVector,
                                                     const lv_32fc_t* aVector,
                                                     const lv_32fc_t* bVector,
                                                     const lv_32fc_t* scalar,
                                                     unsigned int num_points)
{
    const lv_32fc_t* bPtr = bVector;
    const lv_32fc_t* aPtr = aVector;
    lv_32fc_t* cPtr = cVector;
    unsigned int number = num_points;
    unsigned int quarter_points = num_points / 4;

    float32x4x2_t a_val, b_val, c_val, scalar_val;
    float32x4x2_t tmp_val;

    // Broadcast the scalar's real and imaginary parts across all lanes
    scalar_val.val[0] = vld1q_dup_f32((const float*)scalar);
    scalar_val.val[1] = vld1q_dup_f32(((const float*)scalar) + 1);

    for (number = 0; number < quarter_points; ++number) {
        a_val = vld2q_f32((float*)aPtr);
        b_val = vld2q_f32((float*)bPtr);
        // Negate the imaginary lanes so a plain complex multiply computes conj(b) * scalar
        b_val.val[1] = vnegq_f32(b_val.val[1]);
        __VOLK_PREFETCH(aPtr + 8);
        __VOLK_PREFETCH(bPtr + 8);

        tmp_val.val[1] = vmulq_f32(b_val.val[1], scalar_val.val[0]);
        tmp_val.val[0] = vmulq_f32(b_val.val[0], scalar_val.val[0]);

        tmp_val.val[1] = vmlaq_f32(tmp_val.val[1], b_val.val[0], scalar_val.val[1]);
        tmp_val.val[0] = vmlsq_f32(tmp_val.val[0], b_val.val[1], scalar_val.val[1]);

        c_val.val[1] = vaddq_f32(a_val.val[1], tmp_val.val[1]);
        c_val.val[0] = vaddq_f32(a_val.val[0], tmp_val.val[0]);

        vst2q_f32((float*)cPtr, c_val);

        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    for (number = quarter_points * 4; number < num_points; number++) {
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar);
    }
}
#endif /* LV_HAVE_NEON */

#ifdef LV_HAVE_RVV
#include <riscv_vector.h>

static inline void
volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_rvv(lv_32fc_t* cVector,
                                                    const lv_32fc_t* aVector,
                                                    const lv_32fc_t* bVector,
                                                    const lv_32fc_t* scalar,
                                                    unsigned int num_points)
{
    // Broadcast the scalar's real and imaginary parts across all lanes
    vfloat32m2_t vbr =
        __riscv_vfmv_v_f_f32m2(lv_creal(*scalar), __riscv_vsetvlmax_e32m2());
    vfloat32m2_t vbi =
        __riscv_vfmv_v_f_f32m2(lv_cimag(*scalar), __riscv_vsetvlmax_e32m2());
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, bVector += vl, aVector += vl, cVector += vl) {
        vl = __riscv_vsetvl_e32m2(n);
        vuint64m4_t va = __riscv_vle64_v_u64m4((const uint64_t*)bVector, vl);
        vuint64m4_t vc = __riscv_vle64_v_u64m4((const uint64_t*)aVector, vl);
        // Narrowing shifts split each 64-bit complex into its 32-bit real and imag parts
        vfloat32m2_t var = __riscv_vreinterpret_f32m2(__riscv_vnsrl(va, 0, vl));
        vfloat32m2_t vcr = __riscv_vreinterpret_f32m2(__riscv_vnsrl(vc, 0, vl));
        vfloat32m2_t vai = __riscv_vreinterpret_f32m2(__riscv_vnsrl(va, 32, vl));
        vfloat32m2_t vci = __riscv_vreinterpret_f32m2(__riscv_vnsrl(vc, 32, vl));
        // conj(b) * scalar: real = br*sr + bi*si, imag = br*si - bi*sr
        vfloat32m2_t vr = __riscv_vfmacc(__riscv_vfmul(var, vbr, vl), vai, vbi, vl);
        vfloat32m2_t vi = __riscv_vfnmsac(__riscv_vfmul(var, vbi, vl), vai, vbr, vl);
        vuint32m2_t vru = __riscv_vreinterpret_u32m2(__riscv_vfadd(vr, vcr, vl));
        vuint32m2_t viu = __riscv_vreinterpret_u32m2(__riscv_vfadd(vi, vci, vl));
        // Re-interleave: vru + viu * 2^32 packs real (low) and imag (high) lanes together
        vuint64m4_t v =
            __riscv_vwmaccu(__riscv_vwaddu_vv(vru, viu, vl), 0xFFFFFFFF, viu, vl);
        __riscv_vse64((uint64_t*)cVector, v, vl);
    }
}
#endif /*LV_HAVE_RVV*/

#ifdef LV_HAVE_RVVSEG
#include <riscv_vector.h>

static inline void
volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_rvvseg(lv_32fc_t* cVector,
                                                       const lv_32fc_t* aVector,
                                                       const lv_32fc_t* bVector,
                                                       const lv_32fc_t* scalar,
                                                       unsigned int num_points)
{
    // Broadcast the scalar's real and imaginary parts across all lanes
    vfloat32m4_t vbr =
        __riscv_vfmv_v_f_f32m4(lv_creal(*scalar), __riscv_vsetvlmax_e32m4());
    vfloat32m4_t vbi =
        __riscv_vfmv_v_f_f32m4(lv_cimag(*scalar), __riscv_vsetvlmax_e32m4());
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
        vl = __riscv_vsetvl_e32m4(n);
        // Segmented loads deinterleave the real and imaginary parts directly
        vfloat32m4x2_t vc = __riscv_vlseg2e32_v_f32m4x2((const float*)aVector, vl);
        vfloat32m4x2_t va = __riscv_vlseg2e32_v_f32m4x2((const float*)bVector, vl);
        vfloat32m4_t vcr = __riscv_vget_f32m4(vc, 0), vci = __riscv_vget_f32m4(vc, 1);
        vfloat32m4_t var = __riscv_vget_f32m4(va, 0), vai = __riscv_vget_f32m4(va, 1);
        // conj(b) * scalar: real = br*sr + bi*si, imag = br*si - bi*sr
        vfloat32m4_t vr = __riscv_vfmacc(__riscv_vfmul(var, vbr, vl), vai, vbi, vl);
        vfloat32m4_t vi = __riscv_vfnmsac(__riscv_vfmul(var, vbi, vl), vai, vbr, vl);
        vr = __riscv_vfadd(vr, vcr, vl);
        vi = __riscv_vfadd(vi, vci, vl);
        __riscv_vsseg2e32_v_f32m4x2(
            (float*)cVector, __riscv_vcreate_v_f32m4x2(vr, vi), vl);
    }
}
#endif /*LV_HAVE_RVVSEG*/

#endif /* INCLUDED_volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_H */