Vector Optimized Library of Kernels  3.2.0
Architecture-tuned implementations of math kernels
volk_32f_s32f_x2_clamp_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com>
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
44 #ifndef INCLUDED_volk_32fc_s32f_x2_clamp_32f_a_H
45 #define INCLUDED_volk_32fc_s32f_x2_clamp_32f_a_H
46 
47 #ifdef LV_HAVE_GENERIC
/*!
 * \brief Clamp every sample of a float buffer to [min, max] (portable C).
 *
 * A sample greater than \p max is replaced by \p max; otherwise a sample
 * less than \p min is replaced by \p min; otherwise it is copied unchanged.
 * A sample for which both comparisons are false (e.g. NaN) is therefore
 * copied through as-is.
 *
 * \param out         destination buffer, written at indices 0 .. num_points-1
 * \param in          source buffer, read at indices 0 .. num_points-1
 * \param min         lower clamp bound
 * \param max         upper clamp bound
 * \param num_points  number of samples to process
 */
static inline void volk_32f_s32f_x2_clamp_32f_generic(float* out,
                                                      const float* in,
                                                      const float min,
                                                      const float max,
                                                      unsigned int num_points)
{
    for (unsigned int i = 0; i < num_points; i++) {
        const float sample = in[i];
        /* The upper bound is tested first, so it wins if both tests hold. */
        out[i] = (sample > max) ? max : ((sample < min) ? min : sample);
    }
}
67 #endif /* LV_HAVE_GENERIC */
68 
69 #if LV_HAVE_AVX2
70 #include <immintrin.h>
/*!
 * \brief Clamp a float buffer to [min, max], eight samples at a time, using
 *        256-bit intrinsics with aligned load/store.
 *
 * Full groups of eight samples are handled with _mm256_load_ps /
 * _mm256_store_ps, so \p in and \p out must meet the alignment those
 * intrinsics require. The remaining num_points % 8 samples are forwarded to
 * volk_32f_s32f_x2_clamp_32f_generic.
 *
 * \param out         destination buffer (aligned)
 * \param in          source buffer (aligned)
 * \param min         lower clamp bound
 * \param max         upper clamp bound
 * \param num_points  number of samples to process
 */
static inline void volk_32f_s32f_x2_clamp_32f_a_avx2(float* out,
                                                     const float* in,
                                                     const float min,
                                                     const float max,
                                                     unsigned int num_points)
{
    const __m256 lower = _mm256_set1_ps(min);
    const __m256 upper = _mm256_set1_ps(max);
    const unsigned int tail_points = num_points % 8;

    for (unsigned int groups_left = num_points / 8; groups_left > 0; groups_left--) {
        const __m256 samples = _mm256_load_ps(in);
        /* Both masks are derived from the unmodified input lanes. */
        const __m256 above = _mm256_cmp_ps(upper, samples, _CMP_LT_OS);
        const __m256 below = _mm256_cmp_ps(samples, lower, _CMP_LT_OS);
        /* The lower-bound blend is applied last, on top of the upper one. */
        const __m256 clamped =
            _mm256_blendv_ps(_mm256_blendv_ps(samples, upper, above), lower, below);
        _mm256_store_ps(out, clamped);
        in += 8;
        out += 8;
    }

    /* Scalar kernel finishes whatever did not fill a whole vector. */
    volk_32f_s32f_x2_clamp_32f_generic(out, in, min, max, tail_points);
}
96 #endif /* LV_HAVE_AVX2 */
97 
98 #if LV_HAVE_SSE4_1
99 #include <immintrin.h>
/*!
 * \brief Clamp a float buffer to [min, max], four samples at a time, using
 *        128-bit intrinsics with aligned load/store.
 *
 * Full groups of four samples are handled with _mm_load_ps / _mm_store_ps,
 * so \p in and \p out must meet the alignment those intrinsics require. The
 * remaining num_points % 4 samples are forwarded to
 * volk_32f_s32f_x2_clamp_32f_generic.
 *
 * \param out         destination buffer (aligned)
 * \param in          source buffer (aligned)
 * \param min         lower clamp bound
 * \param max         upper clamp bound
 * \param num_points  number of samples to process
 */
static inline void volk_32f_s32f_x2_clamp_32f_a_sse4_1(float* out,
                                                       const float* in,
                                                       const float min,
                                                       const float max,
                                                       unsigned int num_points)
{
    const __m128 lower = _mm_set1_ps(min);
    const __m128 upper = _mm_set1_ps(max);
    const unsigned int tail_points = num_points % 4;

    for (unsigned int groups_left = num_points / 4; groups_left > 0; groups_left--) {
        const __m128 samples = _mm_load_ps(in);
        /* Both masks are derived from the unmodified input lanes. */
        const __m128 above = _mm_cmplt_ps(upper, samples);
        const __m128 below = _mm_cmplt_ps(samples, lower);
        /* The lower-bound blend is applied last, on top of the upper one. */
        const __m128 clamped =
            _mm_blendv_ps(_mm_blendv_ps(samples, upper, above), lower, below);
        _mm_store_ps(out, clamped);
        in += 4;
        out += 4;
    }

    /* Scalar kernel finishes whatever did not fill a whole vector. */
    volk_32f_s32f_x2_clamp_32f_generic(out, in, min, max, tail_points);
}
125 #endif /* LV_HAVE_SSE4_1 */
126 
127 #endif /* INCLUDED_volk_32fc_s32f_x2_clamp_32f_a_H */
128 
129 #ifndef INCLUDED_volk_32fc_s32f_x2_clamp_32f_u_H
130 #define INCLUDED_volk_32fc_s32f_x2_clamp_32f_u_H
131 
132 #if LV_HAVE_AVX2
133 #include <immintrin.h>
/*!
 * \brief Clamp a float buffer to [min, max], eight samples at a time, using
 *        256-bit intrinsics with unaligned load/store.
 *
 * Full groups of eight samples are handled with _mm256_loadu_ps /
 * _mm256_storeu_ps. The remaining num_points % 8 samples are forwarded to
 * volk_32f_s32f_x2_clamp_32f_generic.
 *
 * \param out         destination buffer
 * \param in          source buffer
 * \param min         lower clamp bound
 * \param max         upper clamp bound
 * \param num_points  number of samples to process
 */
static inline void volk_32f_s32f_x2_clamp_32f_u_avx2(float* out,
                                                     const float* in,
                                                     const float min,
                                                     const float max,
                                                     unsigned int num_points)
{
    const __m256 lower = _mm256_set1_ps(min);
    const __m256 upper = _mm256_set1_ps(max);
    const unsigned int tail_points = num_points % 8;

    for (unsigned int groups_left = num_points / 8; groups_left > 0; groups_left--) {
        const __m256 samples = _mm256_loadu_ps(in);
        /* Both masks are derived from the unmodified input lanes. */
        const __m256 above = _mm256_cmp_ps(upper, samples, _CMP_LT_OS);
        const __m256 below = _mm256_cmp_ps(samples, lower, _CMP_LT_OS);
        /* The lower-bound blend is applied last, on top of the upper one. */
        const __m256 clamped =
            _mm256_blendv_ps(_mm256_blendv_ps(samples, upper, above), lower, below);
        _mm256_storeu_ps(out, clamped);
        in += 8;
        out += 8;
    }

    /* Scalar kernel finishes whatever did not fill a whole vector. */
    volk_32f_s32f_x2_clamp_32f_generic(out, in, min, max, tail_points);
}
159 #endif /* LV_HAVE_AVX2 */
160 
161 #if LV_HAVE_SSE4_1
162 #include <immintrin.h>
/*!
 * \brief Clamp a float buffer to [min, max], four samples at a time, using
 *        128-bit intrinsics with unaligned load/store.
 *
 * Full groups of four samples are handled with _mm_loadu_ps /
 * _mm_storeu_ps. The remaining num_points % 4 samples are forwarded to
 * volk_32f_s32f_x2_clamp_32f_generic.
 *
 * \param out         destination buffer
 * \param in          source buffer
 * \param min         lower clamp bound
 * \param max         upper clamp bound
 * \param num_points  number of samples to process
 */
static inline void volk_32f_s32f_x2_clamp_32f_u_sse4_1(float* out,
                                                       const float* in,
                                                       const float min,
                                                       const float max,
                                                       unsigned int num_points)
{
    const __m128 lower = _mm_set1_ps(min);
    const __m128 upper = _mm_set1_ps(max);
    const unsigned int tail_points = num_points % 4;

    for (unsigned int groups_left = num_points / 4; groups_left > 0; groups_left--) {
        const __m128 samples = _mm_loadu_ps(in);
        /* Both masks are derived from the unmodified input lanes. */
        const __m128 above = _mm_cmplt_ps(upper, samples);
        const __m128 below = _mm_cmplt_ps(samples, lower);
        /* The lower-bound blend is applied last, on top of the upper one. */
        const __m128 clamped =
            _mm_blendv_ps(_mm_blendv_ps(samples, upper, above), lower, below);
        _mm_storeu_ps(out, clamped);
        in += 4;
        out += 4;
    }

    /* Scalar kernel finishes whatever did not fill a whole vector. */
    volk_32f_s32f_x2_clamp_32f_generic(out, in, min, max, tail_points);
}
188 #endif /* LV_HAVE_SSE4_1 */
189 
190 #ifdef LV_HAVE_RVV
191 #include <riscv_vector.h>
192 
/*!
 * \brief Clamp a float buffer to [min, max] using RISC-V vector intrinsics.
 *
 * Each pass processes as many samples as __riscv_vsetvl_e32m8 grants for
 * the remaining count, so no separate scalar tail is needed.
 *
 * The clamp is done with compare-and-merge rather than vfmin/vfmax:
 * RISC-V fmin/fmax return the non-NaN operand when exactly one input is NaN,
 * which would turn a NaN sample into \p min. The comparison form leaves such
 * samples untouched and applies the same precedence (upper bound wins) as
 * the comparison chain in volk_32f_s32f_x2_clamp_32f_generic.
 *
 * \param out         destination buffer, written at indices 0 .. num_points-1
 * \param in          source buffer, read at indices 0 .. num_points-1
 * \param min         lower clamp bound
 * \param max         upper clamp bound
 * \param num_points  number of samples to process
 */
static inline void volk_32f_s32f_x2_clamp_32f_rvv(float* out,
                                                  const float* in,
                                                  const float min,
                                                  const float max,
                                                  unsigned int num_points)
{
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, in += vl, out += vl) {
        vl = __riscv_vsetvl_e32m8(n);
        vfloat32m8_t v = __riscv_vle32_v_f32m8(in, vl);
        /* Both masks are taken from the unmodified input; the comparisons
         * are false for NaN lanes, so those lanes are never replaced. */
        vbool4_t below = __riscv_vmflt_vf_f32m8_b4(v, min, vl);
        vbool4_t above = __riscv_vmfgt_vf_f32m8_b4(v, max, vl);
        /* Merge min first and max last so the upper bound takes precedence. */
        v = __riscv_vfmerge_vfm_f32m8(v, min, below, vl);
        v = __riscv_vfmerge_vfm_f32m8(v, max, above, vl);
        __riscv_vse32(out, v, vl);
    }
}
209 #endif /*LV_HAVE_RVV*/
210 
211 #endif /* INCLUDED_volk_32fc_s32f_x2_clamp_32f_u_H */
static void volk_32f_s32f_x2_clamp_32f_generic(float *out, const float *in, const float min, const float max, unsigned int num_points)
Definition: volk_32f_s32f_x2_clamp_32f.h:48