70 #ifndef INCLUDED_volk_32fc_index_max_32u_a_H 71 #define INCLUDED_volk_32fc_index_max_32u_a_H 79 #include <immintrin.h> 82 volk_32fc_index_max_32u_a_avx2(uint32_t* target,
lv_32fc_t* src0, uint32_t num_points)
84 const uint32_t num_bytes = num_points * 8;
93 __m256 xmm1, xmm2, xmm3;
94 __m256i xmm8, xmm11, xmm12, xmm9, xmm10;
96 xmm5.
int_vec = _mm256_setzero_si256();
97 xmm4.int_vec = _mm256_setzero_si256();
98 holderf.int_vec = _mm256_setzero_si256();
99 holderi.int_vec = _mm256_setzero_si256();
101 int bound = num_bytes >> 6;
104 xmm8 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
105 xmm9 = _mm256_setzero_si256();
106 xmm10 = _mm256_set1_epi32(8);
107 xmm3 = _mm256_setzero_ps();
108 __m256i idx = _mm256_setr_epi32(0, 1, 4, 5, 2, 3, 6, 7);
110 for (;
i < bound; ++
i) {
111 xmm1 = _mm256_load_ps((
float*)src0);
112 xmm2 = _mm256_load_ps((
float*)&src0[4]);
116 xmm1 = _mm256_mul_ps(xmm1, xmm1);
117 xmm2 = _mm256_mul_ps(xmm2, xmm2);
119 xmm1 = _mm256_hadd_ps(xmm1, xmm2);
120 xmm1 = _mm256_permutevar8x32_ps(xmm1, idx);
122 xmm3 = _mm256_max_ps(xmm1, xmm3);
124 xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
125 xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
127 xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
128 xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);
130 xmm9 = _mm256_add_epi32(xmm11, xmm12);
132 xmm8 = _mm256_add_epi32(xmm8, xmm10);
135 _mm256_store_ps((
float*)&(holderf.f), xmm3);
136 _mm256_store_si256(&(holderi.int_vec), xmm9);
138 for (
i = 0;
i < 8;
i++) {
139 if (holderf.f[
i] > max) {
140 index = holderi.i[
i];
145 for (
i = bound * 8;
i < num_points;
i++, src0++) {
160 #include <pmmintrin.h> 161 #include <xmmintrin.h> 166 const uint32_t num_bytes = num_points * 8;
173 __m128 xmm1, xmm2, xmm3;
174 __m128i xmm8, xmm11, xmm12, xmm9, xmm10;
176 xmm5.
int_vec = _mm_setzero_si128();
177 xmm4.
int_vec = _mm_setzero_si128();
178 holderf.
int_vec = _mm_setzero_si128();
179 holderi.
int_vec = _mm_setzero_si128();
181 int bound = num_bytes >> 5;
184 xmm8 = _mm_setr_epi32(0, 1, 2, 3);
185 xmm9 = _mm_setzero_si128();
186 xmm10 = _mm_setr_epi32(4, 4, 4, 4);
187 xmm3 = _mm_setzero_ps();
189 for (;
i < bound; ++
i) {
190 xmm1 = _mm_load_ps((
float*)src0);
191 xmm2 = _mm_load_ps((
float*)&src0[2]);
195 xmm1 = _mm_mul_ps(xmm1, xmm1);
196 xmm2 = _mm_mul_ps(xmm2, xmm2);
198 xmm1 = _mm_hadd_ps(xmm1, xmm2);
200 xmm3 = _mm_max_ps(xmm1, xmm3);
202 xmm4.
float_vec = _mm_cmplt_ps(xmm1, xmm3);
203 xmm5.
float_vec = _mm_cmpeq_ps(xmm1, xmm3);
205 xmm11 = _mm_and_si128(xmm8, xmm5.
int_vec);
206 xmm12 = _mm_and_si128(xmm9, xmm4.
int_vec);
208 xmm9 = _mm_add_epi32(xmm11, xmm12);
210 xmm8 = _mm_add_epi32(xmm8, xmm10);
213 if (num_bytes >> 4 & 1) {
214 xmm2 = _mm_load_ps((
float*)src0);
219 xmm2 = _mm_mul_ps(xmm2, xmm2);
223 xmm1 = _mm_hadd_ps(xmm2, xmm2);
225 xmm3 = _mm_max_ps(xmm1, xmm3);
227 xmm10 = _mm_setr_epi32(2, 2, 2, 2);
229 xmm4.
float_vec = _mm_cmplt_ps(xmm1, xmm3);
230 xmm5.
float_vec = _mm_cmpeq_ps(xmm1, xmm3);
232 xmm11 = _mm_and_si128(xmm8, xmm5.
int_vec);
233 xmm12 = _mm_and_si128(xmm9, xmm4.
int_vec);
235 xmm9 = _mm_add_epi32(xmm11, xmm12);
237 xmm8 = _mm_add_epi32(xmm8, xmm10);
240 if (num_bytes >> 3 & 1) {
244 xmm2 = _mm_load1_ps(&sq_dist);
248 xmm3 = _mm_max_ss(xmm3, xmm2);
250 xmm4.
float_vec = _mm_cmplt_ps(xmm1, xmm3);
251 xmm5.
float_vec = _mm_cmpeq_ps(xmm1, xmm3);
253 xmm8 = _mm_shuffle_epi32(xmm8, 0x00);
255 xmm11 = _mm_and_si128(xmm8, xmm4.
int_vec);
256 xmm12 = _mm_and_si128(xmm9, xmm5.
int_vec);
258 xmm9 = _mm_add_epi32(xmm11, xmm12);
261 _mm_store_ps((
float*)&(holderf.
f), xmm3);
262 _mm_store_si128(&(holderi.
int_vec), xmm9);
264 target[0] = holderi.
i[0];
265 sq_dist = holderf.
f[0];
266 target[0] = (holderf.
f[1] > sq_dist) ? holderi.
i[1] : target[0];
267 sq_dist = (holderf.
f[1] > sq_dist) ? holderf.
f[1] : sq_dist;
268 target[0] = (holderf.
f[2] > sq_dist) ? holderi.
i[2] : target[0];
269 sq_dist = (holderf.
f[2] > sq_dist) ? holderf.
f[2] : sq_dist;
270 target[0] = (holderf.
f[3] > sq_dist) ? holderi.
i[3] : target[0];
271 sq_dist = (holderf.
f[3] > sq_dist) ? holderf.
f[3] : sq_dist;
276 #ifdef LV_HAVE_GENERIC 280 const uint32_t num_bytes = num_points * 8;
288 for (; i<num_bytes>> 3; ++
i) {
304 #ifndef INCLUDED_volk_32fc_index_max_32u_u_H 305 #define INCLUDED_volk_32fc_index_max_32u_u_H 307 #include <inttypes.h> 313 #include <immintrin.h> 316 volk_32fc_index_max_32u_u_avx2(uint32_t* target,
lv_32fc_t* src0, uint32_t num_points)
318 const uint32_t num_bytes = num_points * 8;
327 __m256 xmm1, xmm2, xmm3;
328 __m256i xmm8, xmm11, xmm12, xmm9, xmm10;
330 xmm5.
int_vec = _mm256_setzero_si256();
331 xmm4.
int_vec = _mm256_setzero_si256();
332 holderf.
int_vec = _mm256_setzero_si256();
333 holderi.
int_vec = _mm256_setzero_si256();
335 int bound = num_bytes >> 6;
338 xmm8 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
339 xmm9 = _mm256_setzero_si256();
340 xmm10 = _mm256_set1_epi32(8);
341 xmm3 = _mm256_setzero_ps();
342 __m256i idx = _mm256_setr_epi32(0, 1, 4, 5, 2, 3, 6, 7);
344 for (;
i < bound; ++
i) {
345 xmm1 = _mm256_loadu_ps((
float*)src0);
346 xmm2 = _mm256_loadu_ps((
float*)&src0[4]);
350 xmm1 = _mm256_mul_ps(xmm1, xmm1);
351 xmm2 = _mm256_mul_ps(xmm2, xmm2);
353 xmm1 = _mm256_hadd_ps(xmm1, xmm2);
354 xmm1 = _mm256_permutevar8x32_ps(xmm1, idx);
356 xmm3 = _mm256_max_ps(xmm1, xmm3);
358 xmm4.
float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
359 xmm5.
float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);
361 xmm11 = _mm256_and_si256(xmm8, xmm5.
int_vec);
362 xmm12 = _mm256_and_si256(xmm9, xmm4.
int_vec);
364 xmm9 = _mm256_add_epi32(xmm11, xmm12);
366 xmm8 = _mm256_add_epi32(xmm8, xmm10);
369 _mm256_storeu_ps((
float*)&(holderf.
f), xmm3);
370 _mm256_storeu_si256(&(holderi.
int_vec), xmm9);
372 for (
i = 0;
i < 8;
i++) {
373 if (holderf.
f[
i] > max) {
374 index = holderi.
i[
i];
379 for (
i = bound * 8;
i < num_points;
i++, src0++) {
394 #include <arm_neon.h> 400 unsigned int number = 0;
401 const uint32_t quarter_points = num_points / 4;
404 uint32_t indices[4] = { 0, 1, 2, 3 };
405 const uint32x4_t vec_indices_incr = vdupq_n_u32(4);
406 uint32x4_t vec_indices = vld1q_u32(indices);
407 uint32x4_t vec_max_indices = vec_indices;
410 float max = *src0Ptr;
413 float32x4_t vec_max = vdupq_n_f32(*src0Ptr);
415 for (; number < quarter_points; number++) {
417 const float32x4_t vec_mag2 =
421 const uint32x4_t gt_mask = vcgtq_f32(vec_mag2, vec_max);
422 vec_max = vbslq_f32(gt_mask, vec_mag2, vec_max);
423 vec_max_indices = vbslq_u32(gt_mask, vec_indices, vec_max_indices);
424 vec_indices = vaddq_u32(vec_indices, vec_indices_incr);
426 uint32_t tmp_max_indices[4];
428 vst1q_u32(tmp_max_indices, vec_max_indices);
429 vst1q_f32(tmp_max, vec_max);
431 for (
int i = 0;
i < 4;
i++) {
432 if (tmp_max[
i] > max) {
434 index = tmp_max_indices[
i];
439 for (number = quarter_points * 4; number < num_points; number++) {
440 const float re =
lv_creal(*src0Ptr);
441 const float im =
lv_cimag(*src0Ptr);
442 if ((re * re + im * im) > max) {
static void volk_32fc_index_max_32u_a_sse3(uint32_t *target, lv_32fc_t *src0, uint32_t num_points)
Definition: volk_32fc_index_max_32u.h:164
#define bit128_p(x)
Definition: volk_common.h:142
float f[8]
Definition: volk_common.h:132
__m256i int_vec
Definition: volk_common.h:137
uint32_t i[8]
Definition: volk_common.h:131
__m128i int_vec
Definition: volk_common.h:123
static float32x4_t _vmagnitudesquaredq_f32(float32x4x2_t cmplxValue)
Definition: volk_neon_intrinsics.h:86
static void volk_32fc_index_max_32u_generic(uint32_t *target, lv_32fc_t *src0, uint32_t num_points)
Definition: volk_32fc_index_max_32u.h:278
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:62
static void volk_32fc_index_max_32u_neon(uint32_t *target, lv_32fc_t *src0, uint32_t num_points)
Definition: volk_32fc_index_max_32u.h:398
for i
Definition: volk_config_fixed.tmpl.h:25
Definition: volk_common.h:128
__m128 float_vec
Definition: volk_common.h:119
float complex lv_32fc_t
Definition: volk_complex.h:70
__m256 float_vec
Definition: volk_common.h:136
float f[4]
Definition: volk_common.h:115
Definition: volk_common.h:111
#define lv_creal(x)
Definition: volk_complex.h:92
#define lv_cimag(x)
Definition: volk_complex.h:94
uint32_t i[4]
Definition: volk_common.h:114