Vector Optimized Library of Kernels  2.4
Architecture-tuned implementations of math kernels
volk_32fc_x2_divide_32fc.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2016 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
72 #ifndef INCLUDED_volk_32fc_x2_divide_32fc_u_H
73 #define INCLUDED_volk_32fc_x2_divide_32fc_u_H
74 
75 #include <float.h>
76 #include <inttypes.h>
77 #include <volk/volk_complex.h>
78 
79 
80 #ifdef LV_HAVE_GENERIC
81 
82 static inline void volk_32fc_x2_divide_32fc_generic(lv_32fc_t* cVector,
83  const lv_32fc_t* aVector,
84  const lv_32fc_t* bVector,
85  unsigned int num_points)
86 {
87  lv_32fc_t* cPtr = cVector;
88  const lv_32fc_t* aPtr = aVector;
89  const lv_32fc_t* bPtr = bVector;
90 
91  for (unsigned int number = 0; number < num_points; number++) {
92  *cPtr++ = (*aPtr++) / (*bPtr++);
93  }
94 }
95 #endif /* LV_HAVE_GENERIC */
96 
97 
98 #ifdef LV_HAVE_SSE3
99 #include <pmmintrin.h>
101 
102 static inline void volk_32fc_x2_divide_32fc_u_sse3(lv_32fc_t* cVector,
103  const lv_32fc_t* numeratorVector,
104  const lv_32fc_t* denumeratorVector,
105  unsigned int num_points)
106 {
107  /*
108  * we'll do the "classical"
109  * a a b*
110  * --- = -------
111  * b |b|^2
112  * */
113  unsigned int number = 0;
114  const unsigned int quarterPoints = num_points / 4;
115 
116  __m128 num01, num23, den01, den23, norm, result;
117  lv_32fc_t* c = cVector;
118  const lv_32fc_t* a = numeratorVector;
119  const lv_32fc_t* b = denumeratorVector;
120 
121  for (; number < quarterPoints; number++) {
122  num01 = _mm_loadu_ps((float*)a); // first pair
123  den01 = _mm_loadu_ps((float*)b); // first pair
124  num01 = _mm_complexconjugatemul_ps(num01, den01); // a conj(b)
125  a += 2;
126  b += 2;
127 
128  num23 = _mm_loadu_ps((float*)a); // second pair
129  den23 = _mm_loadu_ps((float*)b); // second pair
130  num23 = _mm_complexconjugatemul_ps(num23, den23); // a conj(b)
131  a += 2;
132  b += 2;
133 
134  norm = _mm_magnitudesquared_ps_sse3(den01, den23);
135  den01 = _mm_unpacklo_ps(norm, norm);
136  den23 = _mm_unpackhi_ps(norm, norm);
137 
138  result = _mm_div_ps(num01, den01);
139  _mm_storeu_ps((float*)c, result); // Store the results back into the C container
140  c += 2;
141  result = _mm_div_ps(num23, den23);
142  _mm_storeu_ps((float*)c, result); // Store the results back into the C container
143  c += 2;
144  }
145 
146  number *= 4;
147  for (; number < num_points; number++) {
148  *c = (*a) / (*b);
149  a++;
150  b++;
151  c++;
152  }
153 }
154 #endif /* LV_HAVE_SSE3 */
155 
156 
157 #ifdef LV_HAVE_AVX
158 #include <immintrin.h>
160 
161 static inline void volk_32fc_x2_divide_32fc_u_avx(lv_32fc_t* cVector,
162  const lv_32fc_t* numeratorVector,
163  const lv_32fc_t* denumeratorVector,
164  unsigned int num_points)
165 {
166  /*
167  * we'll do the "classical"
168  * a a b*
169  * --- = -------
170  * b |b|^2
171  * */
172  unsigned int number = 0;
173  const unsigned int quarterPoints = num_points / 4;
174 
175  __m256 num, denum, mul_conj, sq, mag_sq, mag_sq_un, div;
176  lv_32fc_t* c = cVector;
177  const lv_32fc_t* a = numeratorVector;
178  const lv_32fc_t* b = denumeratorVector;
179 
180  for (; number < quarterPoints; number++) {
181  num = _mm256_loadu_ps(
182  (float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
183  denum = _mm256_loadu_ps(
184  (float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
185  mul_conj = _mm256_complexconjugatemul_ps(num, denum);
186  sq = _mm256_mul_ps(denum, denum); // Square the values
187  mag_sq_un = _mm256_hadd_ps(
188  sq, sq); // obtain the actual squared magnitude, although out of order
189  mag_sq = _mm256_permute_ps(mag_sq_un, 0xd8); // I order them
190  // best guide I found on using these functions:
191  // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=2738,2059,2738,2738,3875,3874,3875,2738,3870
192  div = _mm256_div_ps(mul_conj, mag_sq);
193 
194  _mm256_storeu_ps((float*)c, div); // Store the results back into the C container
195 
196  a += 4;
197  b += 4;
198  c += 4;
199  }
200 
201  number = quarterPoints * 4;
202 
203  for (; number < num_points; number++) {
204  *c++ = (*a++) / (*b++);
205  }
206 }
207 #endif /* LV_HAVE_AVX */
208 
209 
210 #endif /* INCLUDED_volk_32fc_x2_divide_32fc_u_H */
211 
212 
213 #ifndef INCLUDED_volk_32fc_x2_divide_32fc_a_H
214 #define INCLUDED_volk_32fc_x2_divide_32fc_a_H
215 
216 #include <float.h>
217 #include <inttypes.h>
218 #include <stdio.h>
219 #include <volk/volk_complex.h>
220 
221 #ifdef LV_HAVE_SSE3
222 #include <pmmintrin.h>
224 
225 static inline void volk_32fc_x2_divide_32fc_a_sse3(lv_32fc_t* cVector,
226  const lv_32fc_t* numeratorVector,
227  const lv_32fc_t* denumeratorVector,
228  unsigned int num_points)
229 {
230  /*
231  * we'll do the "classical"
232  * a a b*
233  * --- = -------
234  * b |b|^2
235  * */
236  unsigned int number = 0;
237  const unsigned int quarterPoints = num_points / 4;
238 
239  __m128 num01, num23, den01, den23, norm, result;
240  lv_32fc_t* c = cVector;
241  const lv_32fc_t* a = numeratorVector;
242  const lv_32fc_t* b = denumeratorVector;
243 
244  for (; number < quarterPoints; number++) {
245  num01 = _mm_load_ps((float*)a); // first pair
246  den01 = _mm_load_ps((float*)b); // first pair
247  num01 = _mm_complexconjugatemul_ps(num01, den01); // a conj(b)
248  a += 2;
249  b += 2;
250 
251  num23 = _mm_load_ps((float*)a); // second pair
252  den23 = _mm_load_ps((float*)b); // second pair
253  num23 = _mm_complexconjugatemul_ps(num23, den23); // a conj(b)
254  a += 2;
255  b += 2;
256 
257  norm = _mm_magnitudesquared_ps_sse3(den01, den23);
258 
259  den01 = _mm_unpacklo_ps(norm, norm); // select the lower floats twice
260  den23 = _mm_unpackhi_ps(norm, norm); // select the upper floats twice
261 
262  result = _mm_div_ps(num01, den01);
263  _mm_store_ps((float*)c, result); // Store the results back into the C container
264  c += 2;
265  result = _mm_div_ps(num23, den23);
266  _mm_store_ps((float*)c, result); // Store the results back into the C container
267  c += 2;
268  }
269 
270  number *= 4;
271  for (; number < num_points; number++) {
272  *c = (*a) / (*b);
273  a++;
274  b++;
275  c++;
276  }
277 }
278 #endif /* LV_HAVE_SSE */
279 
280 #ifdef LV_HAVE_AVX
281 #include <immintrin.h>
283 
284 static inline void volk_32fc_x2_divide_32fc_a_avx(lv_32fc_t* cVector,
285  const lv_32fc_t* numeratorVector,
286  const lv_32fc_t* denumeratorVector,
287  unsigned int num_points)
288 {
289  /*
290  * Guide to AVX intrisics:
291  * https://software.intel.com/sites/landingpage/IntrinsicsGuide/#
292  *
293  * we'll do the "classical"
294  * a a b*
295  * --- = -------
296  * b |b|^2
297  *
298  */
299  lv_32fc_t* c = cVector;
300  const lv_32fc_t* a = numeratorVector;
301  const lv_32fc_t* b = denumeratorVector;
302 
303  const unsigned int eigthPoints = num_points / 8;
304 
305  __m256 num01, num23, denum01, denum23, complex_result, result0, result1;
306 
307  for (unsigned int number = 0; number < eigthPoints; number++) {
308  // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
309  num01 = _mm256_load_ps((float*)a);
310  denum01 = _mm256_load_ps((float*)b);
311 
312  num01 = _mm256_complexconjugatemul_ps(num01, denum01);
313  a += 4;
314  b += 4;
315 
316  num23 = _mm256_load_ps((float*)a);
317  denum23 = _mm256_load_ps((float*)b);
318  num23 = _mm256_complexconjugatemul_ps(num23, denum23);
319  a += 4;
320  b += 4;
321 
322  complex_result = _mm256_hadd_ps(_mm256_mul_ps(denum01, denum01),
323  _mm256_mul_ps(denum23, denum23));
324 
325  denum01 = _mm256_shuffle_ps(complex_result, complex_result, 0x50);
326  denum23 = _mm256_shuffle_ps(complex_result, complex_result, 0xfa);
327 
328  result0 = _mm256_div_ps(num01, denum01);
329  result1 = _mm256_div_ps(num23, denum23);
330 
331  _mm256_store_ps((float*)c, result0);
332  c += 4;
333  _mm256_store_ps((float*)c, result1);
334  c += 4;
335  }
336 
337  volk_32fc_x2_divide_32fc_generic(c, a, b, num_points - eigthPoints * 8);
338 }
339 #endif /* LV_HAVE_AVX */
340 
341 
342 #ifdef LV_HAVE_NEON
343 #include <arm_neon.h>
344 
345 static inline void volk_32fc_x2_divide_32fc_neon(lv_32fc_t* cVector,
346  const lv_32fc_t* aVector,
347  const lv_32fc_t* bVector,
348  unsigned int num_points)
349 {
350  lv_32fc_t* cPtr = cVector;
351  const lv_32fc_t* aPtr = aVector;
352  const lv_32fc_t* bPtr = bVector;
353 
354  float32x4x2_t aVal, bVal, cVal;
355  float32x4_t bAbs, bAbsInv;
356 
357  const unsigned int quarterPoints = num_points / 4;
358  unsigned int number = 0;
359  for (; number < quarterPoints; number++) {
360  aVal = vld2q_f32((const float*)(aPtr));
361  bVal = vld2q_f32((const float*)(bPtr));
362  aPtr += 4;
363  bPtr += 4;
364  __VOLK_PREFETCH(aPtr + 4);
365  __VOLK_PREFETCH(bPtr + 4);
366 
367  bAbs = vmulq_f32(bVal.val[0], bVal.val[0]);
368  bAbs = vmlaq_f32(bAbs, bVal.val[1], bVal.val[1]);
369 
370  bAbsInv = vrecpeq_f32(bAbs);
371  bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs));
372  bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs));
373 
374  cVal.val[0] = vmulq_f32(aVal.val[0], bVal.val[0]);
375  cVal.val[0] = vmlaq_f32(cVal.val[0], aVal.val[1], bVal.val[1]);
376  cVal.val[0] = vmulq_f32(cVal.val[0], bAbsInv);
377 
378  cVal.val[1] = vmulq_f32(aVal.val[1], bVal.val[0]);
379  cVal.val[1] = vmlsq_f32(cVal.val[1], aVal.val[0], bVal.val[1]);
380  cVal.val[1] = vmulq_f32(cVal.val[1], bAbsInv);
381 
382  vst2q_f32((float*)(cPtr), cVal);
383  cPtr += 4;
384  }
385 
386  for (number = quarterPoints * 4; number < num_points; number++) {
387  *cPtr++ = (*aPtr++) / (*bPtr++);
388  }
389 }
390 #endif /* LV_HAVE_NEON */
391 
392 
393 #endif /* INCLUDED_volk_32fc_x2_divide_32fc_a_H */
static void volk_32fc_x2_divide_32fc_a_avx(lv_32fc_t *cVector, const lv_32fc_t *numeratorVector, const lv_32fc_t *denumeratorVector, unsigned int num_points)
Definition: volk_32fc_x2_divide_32fc.h:284
static __m128 _mm_magnitudesquared_ps_sse3(__m128 cplxValue1, __m128 cplxValue2)
Definition: volk_sse3_intrinsics.h:51
static void volk_32fc_x2_divide_32fc_a_sse3(lv_32fc_t *cVector, const lv_32fc_t *numeratorVector, const lv_32fc_t *denumeratorVector, unsigned int num_points)
Definition: volk_32fc_x2_divide_32fc.h:225
static __m256 _mm256_complexconjugatemul_ps(const __m256 x, const __m256 y)
Definition: volk_avx_intrinsics.h:51
static __m128 _mm_complexconjugatemul_ps(__m128 x, __m128 y)
Definition: volk_sse3_intrinsics.h:44
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:62
float complex lv_32fc_t
Definition: volk_complex.h:70
static void volk_32fc_x2_divide_32fc_u_sse3(lv_32fc_t *cVector, const lv_32fc_t *numeratorVector, const lv_32fc_t *denumeratorVector, unsigned int num_points)
Definition: volk_32fc_x2_divide_32fc.h:102
static void volk_32fc_x2_divide_32fc_neon(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_divide_32fc.h:345
static void volk_32fc_x2_divide_32fc_u_avx(lv_32fc_t *cVector, const lv_32fc_t *numeratorVector, const lv_32fc_t *denumeratorVector, unsigned int num_points)
Definition: volk_32fc_x2_divide_32fc.h:161
static void volk_32fc_x2_divide_32fc_generic(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_divide_32fc.h:82