Vector Optimized Library of Kernels 3.1.2
Architecture-tuned implementations of math kernels
volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc.h
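This header implements the VOLK kernel volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc, which computes c[i] = a[i] + conj(b[i]) * scalar over vectors of single-precision complex samples (lv_32fc_t), with the complex scalar passed by pointer. It provides a portable generic protokernel plus SIMD implementations for AVX and SSE3 (in unaligned u_ and aligned a_ flavors) and for NEON; the VOLK dispatcher selects among them at run time.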
/* -*- c++ -*- */
/*
 * Copyright 2019 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

#ifndef INCLUDED_volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_H
#define INCLUDED_volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_H

#include <float.h>
#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_complex.h>

#ifdef LV_HAVE_GENERIC

static inline void
volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_generic(lv_32fc_t* cVector,
                                                        const lv_32fc_t* aVector,
                                                        const lv_32fc_t* bVector,
                                                        const lv_32fc_t* scalar,
                                                        unsigned int num_points)
{
    const lv_32fc_t* aPtr = aVector;
    const lv_32fc_t* bPtr = bVector;
    lv_32fc_t* cPtr = cVector;
    unsigned int number = num_points;

    // unrolled loop: process eight points per iteration
    while (number >= 8) {
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar);
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar);
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar);
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar);
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar);
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar);
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar);
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar);
        number -= 8;
    }

    // clean up any remaining points
    while (number-- > 0) {
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar);
    }
}
#endif /* LV_HAVE_GENERIC */

#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>

static inline void
volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_u_avx(lv_32fc_t* cVector,
                                                      const lv_32fc_t* aVector,
                                                      const lv_32fc_t* bVector,
                                                      const lv_32fc_t* scalar,
                                                      unsigned int num_points)
{
    unsigned int number = 0;
    unsigned int i = 0;
    const unsigned int quarterPoints = num_points / 4;
    unsigned int isodd = num_points & 3; // 0-3 leftover points

    __m256 x, y, s, z;
    lv_32fc_t v_scalar[4] = { *scalar, *scalar, *scalar, *scalar };

    const lv_32fc_t* a = aVector;
    const lv_32fc_t* b = bVector;
    lv_32fc_t* c = cVector;

    // Set up constant scalar vector
    s = _mm256_loadu_ps((float*)v_scalar);

    for (; number < quarterPoints; number++) {
        x = _mm256_loadu_ps((float*)b);
        y = _mm256_loadu_ps((float*)a);
        z = _mm256_complexconjugatemul_ps(s, x); // scalar * conj(b), four points at once
        z = _mm256_add_ps(y, z);
        _mm256_storeu_ps((float*)c, z);

        a += 4;
        b += 4;
        c += 4;
    }

    for (i = num_points - isodd; i < num_points; i++) {
        *c++ = (*a++) + lv_conj(*b++) * (*scalar);
    }
}
#endif /* LV_HAVE_AVX */
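The _mm256_complexconjugatemul_ps helper comes from volk_avx_intrinsics.h and multiplies four complex points per 256-bit register as x * conj(y), so the call above yields scalar * conj(b) for each point. As a scalar reference for the per-point arithmetic (a sketch only; conj_mul_ref is a hypothetical name, not part of VOLK):

static inline lv_32fc_t conj_mul_ref(lv_32fc_t s, lv_32fc_t b)
{
    /* s * conj(b) = (sr*br + si*bi) + i*(si*br - sr*bi) */
    return lv_cmake(lv_creal(s) * lv_creal(b) + lv_cimag(s) * lv_cimag(b),
                    lv_cimag(s) * lv_creal(b) - lv_creal(s) * lv_cimag(b));
}

The SSE3 variants below use the analogous two-point helper _mm_complexconjugatemul_ps from volk_sse3_intrinsics.h.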

#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <volk/volk_sse3_intrinsics.h>

static inline void
volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_u_sse3(lv_32fc_t* cVector,
                                                       const lv_32fc_t* aVector,
                                                       const lv_32fc_t* bVector,
                                                       const lv_32fc_t* scalar,
                                                       unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int halfPoints = num_points / 2;

    __m128 x, y, s, z;
    lv_32fc_t v_scalar[2] = { *scalar, *scalar };

    const lv_32fc_t* a = aVector;
    const lv_32fc_t* b = bVector;
    lv_32fc_t* c = cVector;

    // Set up constant scalar vector
    s = _mm_loadu_ps((float*)v_scalar);

    for (; number < halfPoints; number++) {
        x = _mm_loadu_ps((float*)b);
        y = _mm_loadu_ps((float*)a);
        z = _mm_complexconjugatemul_ps(s, x); // scalar * conj(b), two points at once
        z = _mm_add_ps(y, z);
        _mm_storeu_ps((float*)c, z);

        a += 2;
        b += 2;
        c += 2;
    }

    // handle the final point when num_points is odd
    if ((num_points % 2) != 0) {
        *c = *a + lv_conj(*b) * (*scalar);
    }
}
#endif /* LV_HAVE_SSE3 */
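Note on naming: the u_ protokernels above use unaligned SIMD loads and stores (_mm256_loadu_ps, _mm_loadu_ps, and their store counterparts), while the a_ variants that follow are otherwise identical but use aligned accesses (_mm256_load_ps, _mm_load_ps) and therefore require buffers aligned to the SIMD width, such as those returned by volk_malloc.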

#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>

static inline void
volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_a_avx(lv_32fc_t* cVector,
                                                      const lv_32fc_t* aVector,
                                                      const lv_32fc_t* bVector,
                                                      const lv_32fc_t* scalar,
                                                      unsigned int num_points)
{
    unsigned int number = 0;
    unsigned int i = 0;
    const unsigned int quarterPoints = num_points / 4;
    unsigned int isodd = num_points & 3; // 0-3 leftover points

    __m256 x, y, s, z;
    lv_32fc_t v_scalar[4] = { *scalar, *scalar, *scalar, *scalar };

    const lv_32fc_t* a = aVector;
    const lv_32fc_t* b = bVector;
    lv_32fc_t* c = cVector;

    // Set up constant scalar vector
    s = _mm256_loadu_ps((float*)v_scalar);

    for (; number < quarterPoints; number++) {
        x = _mm256_load_ps((float*)b);
        y = _mm256_load_ps((float*)a);
        z = _mm256_complexconjugatemul_ps(s, x);
        z = _mm256_add_ps(y, z);
        _mm256_store_ps((float*)c, z);

        a += 4;
        b += 4;
        c += 4;
    }

    for (i = num_points - isodd; i < num_points; i++) {
        *c++ = (*a++) + lv_conj(*b++) * (*scalar);
    }
}
#endif /* LV_HAVE_AVX */

#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <volk/volk_sse3_intrinsics.h>

static inline void
volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_a_sse3(lv_32fc_t* cVector,
                                                       const lv_32fc_t* aVector,
                                                       const lv_32fc_t* bVector,
                                                       const lv_32fc_t* scalar,
                                                       unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int halfPoints = num_points / 2;

    __m128 x, y, s, z;
    lv_32fc_t v_scalar[2] = { *scalar, *scalar };

    const lv_32fc_t* a = aVector;
    const lv_32fc_t* b = bVector;
    lv_32fc_t* c = cVector;

    // Set up constant scalar vector
    s = _mm_loadu_ps((float*)v_scalar);

    for (; number < halfPoints; number++) {
        x = _mm_load_ps((float*)b);
        y = _mm_load_ps((float*)a);
        z = _mm_complexconjugatemul_ps(s, x);
        z = _mm_add_ps(y, z);
        _mm_store_ps((float*)c, z);

        a += 2;
        b += 2;
        c += 2;
    }

    // handle the final point when num_points is odd
    if ((num_points % 2) != 0) {
        *c = *a + lv_conj(*b) * (*scalar);
    }
}
#endif /* LV_HAVE_SSE3 */

#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void
volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_neon(lv_32fc_t* cVector,
                                                     const lv_32fc_t* aVector,
                                                     const lv_32fc_t* bVector,
                                                     const lv_32fc_t* scalar,
                                                     unsigned int num_points)
{
    const lv_32fc_t* bPtr = bVector;
    const lv_32fc_t* aPtr = aVector;
    lv_32fc_t* cPtr = cVector;
    unsigned int number = num_points;
    unsigned int quarter_points = num_points / 4;

    float32x4x2_t a_val, b_val, c_val, scalar_val;
    float32x4x2_t tmp_val;

    // broadcast the scalar's real and imaginary parts into separate registers
    scalar_val.val[0] = vld1q_dup_f32((const float*)scalar);
    scalar_val.val[1] = vld1q_dup_f32(((const float*)scalar) + 1);

    for (number = 0; number < quarter_points; ++number) {
        // vld2q_f32 de-interleaves four complex points into real/imag lanes
        a_val = vld2q_f32((float*)aPtr);
        b_val = vld2q_f32((float*)bPtr);
        b_val.val[1] = vnegq_f32(b_val.val[1]); // negate imag(b) to form conj(b)
        __VOLK_PREFETCH(aPtr + 8);
        __VOLK_PREFETCH(bPtr + 8);

        tmp_val.val[1] = vmulq_f32(b_val.val[1], scalar_val.val[0]);
        tmp_val.val[0] = vmulq_f32(b_val.val[0], scalar_val.val[0]);

        tmp_val.val[1] = vmlaq_f32(tmp_val.val[1], b_val.val[0], scalar_val.val[1]);
        tmp_val.val[0] = vmlsq_f32(tmp_val.val[0], b_val.val[1], scalar_val.val[1]);

        c_val.val[1] = vaddq_f32(a_val.val[1], tmp_val.val[1]);
        c_val.val[0] = vaddq_f32(a_val.val[0], tmp_val.val[0]);

        vst2q_f32((float*)cPtr, c_val);

        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    for (number = quarter_points * 4; number < num_points; number++) {
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar);
    }
}
#endif /* LV_HAVE_NEON */

#endif /* INCLUDED_volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_H */
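For reference, a minimal usage sketch (not part of this header), assuming the standard VOLK dispatcher entry point in volk/volk.h and VOLK's aligned allocator (volk_get_alignment, volk_malloc, volk_free); the buffer sizes and values below are illustrative only:

#include <stdio.h>
#include <volk/volk.h>

int main(void)
{
    const unsigned int N = 8;
    const size_t alignment = volk_get_alignment();

    /* VOLK-aligned buffers make the aligned a_ protokernels eligible. */
    lv_32fc_t* a = (lv_32fc_t*)volk_malloc(N * sizeof(lv_32fc_t), alignment);
    lv_32fc_t* b = (lv_32fc_t*)volk_malloc(N * sizeof(lv_32fc_t), alignment);
    lv_32fc_t* c = (lv_32fc_t*)volk_malloc(N * sizeof(lv_32fc_t), alignment);
    lv_32fc_t scalar = lv_cmake(2.0f, -1.0f);

    for (unsigned int i = 0; i < N; i++) {
        a[i] = lv_cmake((float)i, 0.0f);
        b[i] = lv_cmake(0.0f, (float)i);
    }

    /* c[i] = a[i] + conj(b[i]) * scalar; the dispatcher picks the
     * fastest protokernel available on the running machine. */
    volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc(c, a, b, &scalar, N);

    for (unsigned int i = 0; i < N; i++) {
        printf("c[%u] = %+.1f %+.1fi\n", i, lv_creal(c[i]), lv_cimag(c[i]));
    }

    volk_free(a);
    volk_free(b);
    volk_free(c);
    return 0;
}

Build with something like cc example.c -lvolk. Note that the scalar is passed by pointer, matching the s32fc convention in the kernel name.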