Vector Optimized Library of Kernels 3.1.2
Architecture-tuned implementations of math kernels
volk_32fc_x2_square_dist_32f.h
/* -*- c++ -*- */
/*
 * Copyright 2012, 2014 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

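/*
 * Kernel semantics (a hedged summary inferred from the implementations
 * below): given a single complex reference point src0[0] and an array of
 * num_points complex points, each kernel writes the squared Euclidean
 * distance |src0[0] - points[i]|^2 into target[i].
 */
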
#ifndef INCLUDED_volk_32fc_x2_square_dist_32f_a_H
#define INCLUDED_volk_32fc_x2_square_dist_32f_a_H

#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_complex.h>

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void volk_32fc_x2_square_dist_32f_a_avx2(float* target,
                                                       lv_32fc_t* src0,
                                                       lv_32fc_t* points,
                                                       unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 8;
    __m128 xmm0, xmm9, xmm10;
    __m256 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

    lv_32fc_t diff;
    float sq_dist;
    int bound = num_bytes >> 6;
    int leftovers0 = (num_bytes >> 5) & 1;
    int leftovers1 = (num_bytes >> 4) & 1;
    int leftovers2 = (num_bytes >> 3) & 1;
    int i = 0;
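    /* num_bytes = num_points * sizeof(lv_32fc_t). bound counts the full
     * 8-point (64-byte) chunks; the leftovers flags mark one optional
     * trailing chunk of 4, 2, and 1 point(s), respectively. */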

    __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    xmm1 = _mm256_setzero_ps();
    xmm0 = _mm_load_ps((float*)src0);
    xmm0 = _mm_permute_ps(xmm0, 0b01000100);
    xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 0);
    xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 1);

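    /* Main loop: each iteration handles 8 complex points. xmm1 holds
     * src0[0] replicated into all four complex slots (the permute keeps
     * only the first complex value of the 16-byte load). After squaring,
     * _mm256_hadd_ps sums re^2 + im^2 within 128-bit lanes, and the
     * permutevar8x32 with idx restores memory order across lanes. */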
    for (; i < bound; ++i) {
        xmm2 = _mm256_load_ps((float*)&points[0]);
        xmm3 = _mm256_load_ps((float*)&points[4]);
        points += 8;

        xmm4 = _mm256_sub_ps(xmm1, xmm2);
        xmm5 = _mm256_sub_ps(xmm1, xmm3);
        xmm6 = _mm256_mul_ps(xmm4, xmm4);
        xmm7 = _mm256_mul_ps(xmm5, xmm5);

        xmm4 = _mm256_hadd_ps(xmm6, xmm7);
        xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);

        _mm256_store_ps(target, xmm4);

        target += 8;
    }

    for (i = 0; i < leftovers0; ++i) {

        xmm2 = _mm256_load_ps((float*)&points[0]);

        xmm4 = _mm256_sub_ps(xmm1, xmm2);

        points += 4;

        xmm6 = _mm256_mul_ps(xmm4, xmm4);

        xmm4 = _mm256_hadd_ps(xmm6, xmm6);
        xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);

        xmm9 = _mm256_extractf128_ps(xmm4, 1);
        _mm_store_ps(target, xmm9);

        target += 4;
    }

    for (i = 0; i < leftovers1; ++i) {
        xmm9 = _mm_load_ps((float*)&points[0]);

        xmm10 = _mm_sub_ps(xmm0, xmm9);

        points += 2;

        xmm9 = _mm_mul_ps(xmm10, xmm10);

        xmm10 = _mm_hadd_ps(xmm9, xmm9);

        _mm_storeh_pi((__m64*)target, xmm10);

        target += 2;
    }

    for (i = 0; i < leftovers2; ++i) {

        diff = src0[0] - points[0];

        sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);

        target[0] = sq_dist;
    }
}

#endif /*LV_HAVE_AVX2*/

#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <xmmintrin.h>

static inline void volk_32fc_x2_square_dist_32f_a_sse3(float* target,
                                                       lv_32fc_t* src0,
                                                       lv_32fc_t* points,
                                                       unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 8;

    __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

    lv_32fc_t diff;
    float sq_dist;
    int bound = num_bytes >> 5;
    int i = 0;

    xmm1 = _mm_setzero_ps();
    xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0);
    xmm1 = _mm_movelh_ps(xmm1, xmm1);

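    /* xmm1 now holds src0[0] duplicated in both 64-bit halves; each
     * iteration below processes 4 complex points (bound = num_bytes >> 5
     * counts the full 32-byte chunks). */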
    for (; i < bound; ++i) {
        xmm2 = _mm_load_ps((float*)&points[0]);
        xmm4 = _mm_sub_ps(xmm1, xmm2);
        xmm3 = _mm_load_ps((float*)&points[2]);
        xmm5 = _mm_sub_ps(xmm1, xmm3);

        xmm6 = _mm_mul_ps(xmm4, xmm4);
        xmm7 = _mm_mul_ps(xmm5, xmm5);

        xmm4 = _mm_hadd_ps(xmm6, xmm7);

        _mm_store_ps(target, xmm4);

        points += 4;
        target += 4;
    }

    if ((num_bytes >> 4) & 1) {

        xmm2 = _mm_load_ps((float*)&points[0]);

        xmm4 = _mm_sub_ps(xmm1, xmm2);

        points += 2;

        xmm6 = _mm_mul_ps(xmm4, xmm4);

        xmm4 = _mm_hadd_ps(xmm6, xmm6);

        _mm_storeh_pi((__m64*)target, xmm4);

        target += 2;
    }

    if ((num_bytes >> 3) & 1) {

        diff = src0[0] - points[0];

        sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);

        target[0] = sq_dist;
    }
}

#endif /*LV_HAVE_SSE3*/


#ifdef LV_HAVE_NEON
#include <arm_neon.h>
static inline void volk_32fc_x2_square_dist_32f_neon(float* target,
                                                     lv_32fc_t* src0,
                                                     lv_32fc_t* points,
                                                     unsigned int num_points)
{
    const unsigned int quarter_points = num_points / 4;
    unsigned int number;

    float32x4x2_t a_vec, b_vec;
    float32x4x2_t diff_vec;
    float32x4_t tmp, tmp1, dist_sq;
    a_vec.val[0] = vdupq_n_f32(lv_creal(src0[0]));
    a_vec.val[1] = vdupq_n_f32(lv_cimag(src0[0]));
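    /* vld2q_f32 deinterleaves each group of 4 complex points into one
     * vector of real parts and one of imaginary parts, so the squared
     * distance is two subtracts, two squares, and one add per group. */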
    for (number = 0; number < quarter_points; ++number) {
        b_vec = vld2q_f32((float*)points);
        diff_vec.val[0] = vsubq_f32(a_vec.val[0], b_vec.val[0]);
        diff_vec.val[1] = vsubq_f32(a_vec.val[1], b_vec.val[1]);
        tmp = vmulq_f32(diff_vec.val[0], diff_vec.val[0]);
        tmp1 = vmulq_f32(diff_vec.val[1], diff_vec.val[1]);

        dist_sq = vaddq_f32(tmp, tmp1);
        vst1q_f32(target, dist_sq);
        points += 4;
        target += 4;
    }
    for (number = quarter_points * 4; number < num_points; ++number) {
        lv_32fc_t diff = src0[0] - *points++;
        *target++ = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
    }
}
#endif /* LV_HAVE_NEON */


#ifdef LV_HAVE_GENERIC
static inline void volk_32fc_x2_square_dist_32f_generic(float* target,
                                                        lv_32fc_t* src0,
                                                        lv_32fc_t* points,
                                                        unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 8;

    lv_32fc_t diff;
    float sq_dist;
    unsigned int i = 0;

    /* num_bytes >> 3 equals num_points, since each lv_32fc_t is 8 bytes. */
    for (; i < (num_bytes >> 3); ++i) {
        diff = src0[0] - points[i];

        sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);

        target[i] = sq_dist;
    }
}

#endif /*LV_HAVE_GENERIC*/


#endif /*INCLUDED_volk_32fc_x2_square_dist_32f_a_H*/

#ifndef INCLUDED_volk_32fc_x2_square_dist_32f_u_H
#define INCLUDED_volk_32fc_x2_square_dist_32f_u_H

#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_complex.h>

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void volk_32fc_x2_square_dist_32f_u_avx2(float* target,
                                                       lv_32fc_t* src0,
                                                       lv_32fc_t* points,
                                                       unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 8;
    __m128 xmm0, xmm9;
    __m256 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

    lv_32fc_t diff;
    float sq_dist;
    int bound = num_bytes >> 6;
    int leftovers1 = (num_bytes >> 3) & 0b11;
    int i = 0;

    __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    xmm1 = _mm256_setzero_ps();
    xmm0 = _mm_loadu_ps((float*)src0);
    xmm0 = _mm_permute_ps(xmm0, 0b01000100);
    xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 0);
    xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 1);

    /* Same scheme as the aligned AVX2 kernel above, but with unaligned
     * loads and stores. */
    for (; i < bound; ++i) {
        xmm2 = _mm256_loadu_ps((float*)&points[0]);
        xmm3 = _mm256_loadu_ps((float*)&points[4]);
        points += 8;

        xmm4 = _mm256_sub_ps(xmm1, xmm2);
        xmm5 = _mm256_sub_ps(xmm1, xmm3);
        xmm6 = _mm256_mul_ps(xmm4, xmm4);
        xmm7 = _mm256_mul_ps(xmm5, xmm5);

        xmm4 = _mm256_hadd_ps(xmm6, xmm7);
        xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);

        _mm256_storeu_ps(target, xmm4);

        target += 8;
    }

    if ((num_bytes >> 5) & 1) {

        xmm2 = _mm256_loadu_ps((float*)&points[0]);

        xmm4 = _mm256_sub_ps(xmm1, xmm2);

        points += 4;

        xmm6 = _mm256_mul_ps(xmm4, xmm4);

        xmm4 = _mm256_hadd_ps(xmm6, xmm6);
        xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);

        xmm9 = _mm256_extractf128_ps(xmm4, 1);
        _mm_storeu_ps(target, xmm9);

        target += 4;
    }

    for (i = 0; i < leftovers1; ++i) {

        diff = src0[0] - points[0];
        points += 1;

        sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);

        target[0] = sq_dist;
        target += 1;
    }
}

#endif /*LV_HAVE_AVX2*/

#endif /*INCLUDED_volk_32fc_x2_square_dist_32f_u_H*/
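
/*
 * Minimal usage sketch. This assumes the generated dispatch entry point
 * volk_32fc_x2_square_dist_32f from <volk/volk.h> plus the public helpers
 * volk_malloc/volk_free/volk_get_alignment and the lv_cmake macro; the
 * buffer size and sample values below are made up for illustration.
 *
 *   #include <volk/volk.h>
 *
 *   unsigned int n = 64;
 *   lv_32fc_t* points =
 *       (lv_32fc_t*)volk_malloc(n * sizeof(lv_32fc_t), volk_get_alignment());
 *   float* dists = (float*)volk_malloc(n * sizeof(float), volk_get_alignment());
 *   lv_32fc_t symbol = lv_cmake(0.5f, 2.0f);
 *   // ... fill points[] with constellation points ...
 *   volk_32fc_x2_square_dist_32f(dists, &symbol, points, n);
 *   // dists[i] now holds |symbol - points[i]|^2
 *   volk_free(points);
 *   volk_free(dists);
 */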