Vector Optimized Library of Kernels 3.2.0
Architecture-tuned implementations of math kernels
volk_32fc_index_max_16u.h
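Before the listing, a minimal usage sketch, assuming the standard VOLK public API (the volk.h dispatcher plus volk_malloc/volk_free); the dispatcher call volk_32fc_index_max_16u selects the best kernel below at runtime. Buffer contents and sizes here are illustrative only, not part of this header.

// Usage sketch (assumption: standard VOLK dispatcher and allocator; not part
// of this header). Finds the index of the complex sample with the largest
// magnitude in an aligned buffer.
#include <stdio.h>
#include <volk/volk.h>

int main(void)
{
    const uint32_t num_points = 1024;
    lv_32fc_t* in = (lv_32fc_t*)volk_malloc(num_points * sizeof(lv_32fc_t),
                                            volk_get_alignment());
    for (uint32_t i = 0; i < num_points; ++i)
        in[i] = lv_cmake((float)i, (float)-i); // |in[i]|^2 = 2*i*i, grows with i
    uint16_t max_index = 0;
    volk_32fc_index_max_16u(&max_index, in, num_points); // runtime-dispatched
    printf("largest magnitude at index %u\n", max_index); // expect 1023 here
    volk_free(in);
    return 0;
}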
/* -*- c++ -*- */
/*
 * Copyright 2012, 2014-2016, 2018-2020 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

#ifndef INCLUDED_volk_32fc_index_max_16u_a_H
#define INCLUDED_volk_32fc_index_max_16u_a_H

#include <inttypes.h>
#include <limits.h>
#include <stdio.h>
#include <volk/volk_common.h>
#include <volk/volk_complex.h>

#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void volk_32fc_index_max_16u_a_avx2_variant_0(uint16_t* target,
                                                            const lv_32fc_t* src0,
                                                            uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. An explanation of the odd order is
     * given in the implementation of vector_32fc_index_max_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 max_values = _mm256_setzero_ps();
    __m256i max_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_load_ps((float*)src0);
        __m256 in1 = _mm256_load_ps((float*)(src0 + 4));
        vector_32fc_index_max_variant0(
            in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
        src0 += 8;
    }

    // determine maximum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
    _mm256_store_ps(max_values_buffer, max_values);
    _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);

    float max = 0.f;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (max_values_buffer[i] > max) {
            max = max_values_buffer[i];
            index = max_indices_buffer[i];
        }
    }

    // handle tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
        if (abs_squared > max) {
            max = abs_squared;
            index = i;
        }
        ++src0;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/
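Why the odd order in current_indices? _mm256_set_epi32 lists elements from high to low, so the vector holds {0, 1, 4, 5, 2, 3, 6, 7} reading from lane 0 upward. As the in-code comment says, the authoritative explanation lives in vector_32fc_index_max_variant0() in volk_avx2_intrinsics.h; the order is exactly what a per-128-bit-lane horizontal add over the squared inputs produces, since _mm256_hadd_ps interleaves half-lanes of its two operands. The index vector simply pre-applies that permutation. A standalone sketch of the ordering (my illustration, not the VOLK helper itself):

// Sketch: lane ordering produced by a per-lane horizontal add over squared
// complex samples (illustration only; compile with -mavx2).
#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    // c0..c7 with re = k, im = 0, so |ck|^2 = k*k and lanes are easy to spot
    float z[16] = { 0 };
    for (int k = 0; k < 8; ++k)
        z[2 * k] = (float)k;

    __m256 in0 = _mm256_loadu_ps(z);     // c0..c3
    __m256 in1 = _mm256_loadu_ps(z + 8); // c4..c7
    __m256 sq = _mm256_hadd_ps(_mm256_mul_ps(in0, in0), _mm256_mul_ps(in1, in1));

    float out[8];
    _mm256_storeu_ps(out, sq);
    for (int k = 0; k < 8; ++k)
        printf("%.0f ", out[k]); // prints 0 1 16 25 4 9 36 49, i.e. the
    printf("\n");                // |c|^2 values in order 0,1,4,5,2,3,6,7
    return 0;
}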

#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void volk_32fc_index_max_16u_a_avx2_variant_1(uint16_t* target,
                                                            const lv_32fc_t* src0,
                                                            uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. An explanation of the odd order is
     * given in the implementation of vector_32fc_index_max_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 max_values = _mm256_setzero_ps();
    __m256i max_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_load_ps((float*)src0);
        __m256 in1 = _mm256_load_ps((float*)(src0 + 4));
        vector_32fc_index_max_variant1(
            in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
        src0 += 8;
    }

    // determine maximum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
    _mm256_store_ps(max_values_buffer, max_values);
    _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);

    float max = 0.f;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (max_values_buffer[i] > max) {
            max = max_values_buffer[i];
            index = max_indices_buffer[i];
        }
    }

    // handle tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
        if (abs_squared > max) {
            max = abs_squared;
            index = i;
        }
        ++src0;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/

#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <xmmintrin.h>

static inline void volk_32fc_index_max_16u_a_sse3(uint16_t* target,
                                                  const lv_32fc_t* src0,
                                                  uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
    const uint32_t num_bytes = num_points * 8;

    union bit128 holderf;
    union bit128 holderi;
    float sq_dist = 0.0;

    union bit128 xmm5, xmm4;
    __m128 xmm1, xmm2, xmm3;
    __m128i xmm8, xmm11, xmm12, xmm9, xmm10;

    xmm5.int_vec = _mm_setzero_si128();
    xmm4.int_vec = _mm_setzero_si128();
    holderf.int_vec = _mm_setzero_si128();
    holderi.int_vec = _mm_setzero_si128();

    int bound = num_bytes >> 5;
    int i = 0;

    xmm8 = _mm_setr_epi32(0, 1, 2, 3);
    xmm9 = _mm_setzero_si128();
    xmm10 = _mm_setr_epi32(4, 4, 4, 4);
    xmm3 = _mm_setzero_ps();

    for (; i < bound; ++i) {
        xmm1 = _mm_load_ps((float*)src0);
        xmm2 = _mm_load_ps((float*)&src0[2]);

        src0 += 4;

        xmm1 = _mm_mul_ps(xmm1, xmm1);
        xmm2 = _mm_mul_ps(xmm2, xmm2);

        xmm1 = _mm_hadd_ps(xmm1, xmm2);

        xmm3 = _mm_max_ps(xmm1, xmm3);

        xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);

        xmm8 = _mm_add_epi32(xmm8, xmm10);
    }

    if (num_bytes >> 4 & 1) {
        xmm2 = _mm_load_ps((float*)src0);

        xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
        xmm8 = bit128_p(&xmm1)->int_vec;

        xmm2 = _mm_mul_ps(xmm2, xmm2);

        src0 += 2;

        xmm1 = _mm_hadd_ps(xmm2, xmm2);

        xmm3 = _mm_max_ps(xmm1, xmm3);

        xmm10 = _mm_setr_epi32(2, 2, 2, 2);

        xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);

        xmm8 = _mm_add_epi32(xmm8, xmm10);
    }

    if (num_bytes >> 3 & 1) {
        sq_dist =
            lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);

        xmm2 = _mm_load1_ps(&sq_dist);

        xmm1 = xmm3;

        xmm3 = _mm_max_ss(xmm3, xmm2);

        xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        xmm8 = _mm_shuffle_epi32(xmm8, 0x00);

        xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);
    }

    _mm_store_ps((float*)&(holderf.f), xmm3);
    _mm_store_si128(&(holderi.int_vec), xmm9);

    target[0] = holderi.i[0];
    sq_dist = holderf.f[0];
    target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
    sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
    target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
    sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
    target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
    sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
}

#endif /*LV_HAVE_SSE3*/
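The mask arithmetic in the SSE3 loop above is a branchless per-lane select: _mm_cmpeq_ps and _mm_cmplt_ps against the updated running maximum yield disjoint all-ones/all-zeros masks, so ANDing the candidate indices with one mask and the previous indices with the other and adding the results amounts to a conditional move (ties go to the newer index). A scalar model of one lane (illustration only, not VOLK code):

// Scalar sketch of the mask-select used in the SSE3 kernel above: pick
// cand_index where the candidate value became the running max, else keep
// old_index. Mirrors the xmm11/xmm12/xmm9 computation for a single lane.
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    float running_max = 5.0f, candidate = 7.0f;
    uint32_t old_index = 3, cand_index = 9;

    float new_max = (candidate > running_max) ? candidate : running_max; // _mm_max_ps
    uint32_t mask_eq = (candidate == new_max) ? 0xFFFFFFFFu : 0; // _mm_cmpeq_ps
    uint32_t mask_lt = (candidate < new_max) ? 0xFFFFFFFFu : 0;  // _mm_cmplt_ps
    // The masks are disjoint, so add behaves like or: exactly one term survives.
    uint32_t new_index = (cand_index & mask_eq) + (old_index & mask_lt);
    printf("max=%.1f index=%u\n", new_max, new_index); // max=7.0 index=9
    return 0;
}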

#ifdef LV_HAVE_GENERIC
static inline void volk_32fc_index_max_16u_generic(uint16_t* target,
                                                   const lv_32fc_t* src0,
                                                   uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    const uint32_t num_bytes = num_points * 8;

    float sq_dist = 0.0;
    float max = 0.0;
    uint16_t index = 0;

    uint32_t i = 0;

    for (; i < (num_bytes >> 3); ++i) {
        sq_dist =
            lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);

        if (sq_dist > max) {
            index = i;
            max = sq_dist;
        }
    }
    target[0] = index;
}

#endif /*LV_HAVE_GENERIC*/
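Every implementation in this file, the generic reference included, ranks samples by |z|^2 rather than |z|: squaring is strictly increasing on non-negative reals, so the argmax is unchanged and the per-sample square root is avoided. A tiny self-check of that equivalence (illustration only):

// Sketch: comparing squared magnitudes is order-preserving, so the argmax
// over |z| and over |z|^2 coincide. Illustration only; link with -lm if needed.
#include <assert.h>
#include <complex.h>

int main(void)
{
    float complex a = 3.0f + 4.0f * I; // |a| = 5
    float complex b = 1.0f + 2.0f * I; // |b| = sqrt(5)
    float a2 = crealf(a) * crealf(a) + cimagf(a) * cimagf(a);
    float b2 = crealf(b) * crealf(b) + cimagf(b) * cimagf(b);
    assert((cabsf(a) > cabsf(b)) == (a2 > b2));
    return 0;
}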

#endif /*INCLUDED_volk_32fc_index_max_16u_a_H*/

#ifndef INCLUDED_volk_32fc_index_max_16u_u_H
#define INCLUDED_volk_32fc_index_max_16u_u_H

#include <inttypes.h>
#include <limits.h>
#include <stdio.h>
#include <volk/volk_common.h>
#include <volk/volk_complex.h>

#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void volk_32fc_index_max_16u_u_avx2_variant_0(uint16_t* target,
                                                            const lv_32fc_t* src0,
                                                            uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. An explanation of the odd order is
     * given in the implementation of vector_32fc_index_max_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 max_values = _mm256_setzero_ps();
    __m256i max_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_loadu_ps((float*)src0);
        __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4));
        vector_32fc_index_max_variant0(
            in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
        src0 += 8;
    }

    // determine maximum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
    _mm256_store_ps(max_values_buffer, max_values);
    _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);

    float max = 0.f;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (max_values_buffer[i] > max) {
            max = max_values_buffer[i];
            index = max_indices_buffer[i];
        }
    }

    // handle tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
        if (abs_squared > max) {
            max = abs_squared;
            index = i;
        }
        ++src0;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/

#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void volk_32fc_index_max_16u_u_avx2_variant_1(uint16_t* target,
                                                            const lv_32fc_t* src0,
                                                            uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. An explanation of the odd order is
     * given in the implementation of vector_32fc_index_max_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 max_values = _mm256_setzero_ps();
    __m256i max_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_loadu_ps((float*)src0);
        __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4));
        vector_32fc_index_max_variant1(
            in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
        src0 += 8;
    }

    // determine maximum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
    _mm256_store_ps(max_values_buffer, max_values);
    _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);

    float max = 0.f;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (max_values_buffer[i] > max) {
            max = max_values_buffer[i];
            index = max_indices_buffer[i];
        }
    }

    // handle tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
        if (abs_squared > max) {
            max = abs_squared;
            index = i;
        }
        ++src0;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/

#ifdef LV_HAVE_RVV
#include <float.h>
#include <riscv_vector.h>
#include <volk/volk_rvv_intrinsics.h>

static inline void
volk_32fc_index_max_16u_rvv(uint16_t* target, const lv_32fc_t* src0, uint32_t num_points)
{
    vfloat32m4_t vmax = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4());
    vuint16m2_t vmaxi = __riscv_vmv_v_x_u16m2(0, __riscv_vsetvlmax_e16m2());
    vuint16m2_t vidx = __riscv_vid_v_u16m2(__riscv_vsetvlmax_e16m2());
    size_t n = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
    for (size_t vl; n > 0; n -= vl, src0 += vl) {
        vl = __riscv_vsetvl_e32m4(n);
        vuint64m8_t vc = __riscv_vle64_v_u64m8((const uint64_t*)src0, vl);
        vfloat32m4_t vr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 0, vl));
        vfloat32m4_t vi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 32, vl));
        vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vr, vr, vl), vi, vi, vl);
        vbool8_t m = __riscv_vmflt(vmax, v, vl);
        vmax = __riscv_vfmax_tu(vmax, vmax, v, vl);
        vmaxi = __riscv_vmerge_tu(vmaxi, vmaxi, vidx, m, vl);
        // the index vector is u16 with LMUL=2, so advance it under its own VLMAX
        vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e16m2());
    }
    size_t vl = __riscv_vsetvlmax_e32m4();
    float max = __riscv_vfmv_f(__riscv_vfredmax(RISCV_SHRINK4(vfmax, f, 32, vmax),
                                                __riscv_vfmv_v_f_f32m1(0, 1),
                                                __riscv_vsetvlmax_e32m1()));
    vbool8_t m = __riscv_vmfeq(vmax, max, vl);
    *target = __riscv_vmv_x(__riscv_vslidedown(vmaxi, __riscv_vfirst(m, vl), vl));
}
#endif /*LV_HAVE_RVV*/
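The epilogue above is the standard RVV argmax idiom: vfredmax collapses the per-lane maxima to one scalar, vmfeq rebuilds a mask of lanes holding that scalar, vfirst returns the position of the first such lane, and vslidedown brings the matching entry of the saved index vector to element 0 for extraction. A scalar model of those three steps (illustration only, plain C rather than intrinsics):

// Scalar model of the RVV epilogue above: reduce per-lane maxima, find the
// first lane equal to the reduction, read out its saved index. Lane count and
// values are arbitrary examples.
#include <stdint.h>
#include <stdio.h>

#define LANES 8

int main(void)
{
    float vmax[LANES] = { 2, 9, 4, 9, 1, 0, 3, 5 };             // per-lane maxima
    uint16_t vmaxi[LANES] = { 10, 21, 32, 43, 54, 65, 76, 87 }; // their indices

    float max = vmax[0]; // __riscv_vfredmax
    for (int i = 1; i < LANES; ++i)
        if (vmax[i] > max)
            max = vmax[i];

    int first = -1; // __riscv_vmfeq + __riscv_vfirst
    for (int i = 0; i < LANES; ++i)
        if (vmax[i] == max) {
            first = i;
            break;
        }

    uint16_t result = vmaxi[first]; // __riscv_vslidedown + __riscv_vmv_x
    printf("max=%.0f at stored index %u\n", max, result); // max=9 at index 21
    return 0;
}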

#ifdef LV_HAVE_RVVSEG
#include <float.h>
#include <riscv_vector.h>
#include <volk/volk_rvv_intrinsics.h>

static inline void volk_32fc_index_max_16u_rvvseg(uint16_t* target,
                                                  const lv_32fc_t* src0,
                                                  uint32_t num_points)
{
    vfloat32m4_t vmax = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4());
    vuint16m2_t vmaxi = __riscv_vmv_v_x_u16m2(0, __riscv_vsetvlmax_e16m2());
    vuint16m2_t vidx = __riscv_vid_v_u16m2(__riscv_vsetvlmax_e16m2());
    size_t n = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
    for (size_t vl; n > 0; n -= vl, src0 += vl) {
        vl = __riscv_vsetvl_e32m4(n);
        vfloat32m4x2_t vc = __riscv_vlseg2e32_v_f32m4x2((const float*)src0, vl);
        vfloat32m4_t vr = __riscv_vget_f32m4(vc, 0), vi = __riscv_vget_f32m4(vc, 1);
        vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vr, vr, vl), vi, vi, vl);
        vbool8_t m = __riscv_vmflt(vmax, v, vl);
        vmax = __riscv_vfmax_tu(vmax, vmax, v, vl);
        vmaxi = __riscv_vmerge_tu(vmaxi, vmaxi, vidx, m, vl);
        // the index vector is u16 with LMUL=2, so advance it under its own VLMAX
        vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e16m2());
    }
    size_t vl = __riscv_vsetvlmax_e32m4();
    float max = __riscv_vfmv_f(__riscv_vfredmax(RISCV_SHRINK4(vfmax, f, 32, vmax),
                                                __riscv_vfmv_v_f_f32m1(0, 1),
                                                __riscv_vsetvlmax_e32m1()));
    vbool8_t m = __riscv_vmfeq(vmax, max, vl);
    *target = __riscv_vmv_x(__riscv_vslidedown(vmaxi, __riscv_vfirst(m, vl), vl));
}
#endif /*LV_HAVE_RVVSEG*/
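The segment-load kernel above differs from volk_32fc_index_max_16u_rvv only in how it splits the interleaved (re, im) pairs: __riscv_vlseg2e32 deinterleaves during the load, while the non-segment version loads each complex as one 64-bit word and carves it up with two narrowing shifts (vnsrl by 0 and by 32). A scalar model of that bit-level split, assuming little-endian float packing (illustration only):

// Sketch of the vle64 + vnsrl deinterleave used by the non-segment RVV kernel:
// one complex float occupies 64 bits, re in the low 32 and im in the high 32
// on a little-endian target. Illustration only.
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    float pair[2] = { 1.5f, -2.5f }; // (re, im) as stored in memory
    uint64_t word;
    memcpy(&word, pair, sizeof word);

    uint32_t re_bits = (uint32_t)(word >> 0);  // vnsrl(vc, 0)
    uint32_t im_bits = (uint32_t)(word >> 32); // vnsrl(vc, 32)
    float re, im;
    memcpy(&re, &re_bits, sizeof re); // vreinterpret back to f32
    memcpy(&im, &im_bits, sizeof im);
    printf("re=%.1f im=%.1f\n", re, im); // re=1.5 im=-2.5
    return 0;
}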

#endif /*INCLUDED_volk_32fc_index_max_16u_u_H*/