32 #ifndef INCLUDED_VOLK_32F_S32F_S32F_MOD_RANGE_32F_A_H
33 #define INCLUDED_VOLK_32F_S32F_S32F_MOD_RANGE_32F_A_H
35 #ifdef LV_HAVE_GENERIC
/*
 * Parameter tail and loop body of volk_32f_s32f_s32f_mod_range_32f_generic
 * (name and full signature taken from the cross-reference entry for listing
 * line 37 at the end of this file): the scalar kernel. Every sample that lies
 * below lower_bound or above upper_bound is shifted by a whole number of
 * range widths (upper_bound - lower_bound) toward the range.
 *
 * inputVector  - samples to read; the loop visits num_points of them.
 * lower_bound  - lower edge of the target range.
 * upper_bound  - upper edge of the target range.
 * num_points   - number of samples to process.
 *
 * NOTE(review): this listing drops source lines (embedded numbers 37, 42, 44,
 * 46, 48 and everything after 56 are absent). The declarations of inPtr and
 * val, the handling of samples already inside the range, the advance of
 * outPtr and the closing braces are therefore not visible here -- confirm
 * against the real header before relying on this fragment.
 */
38 const float* inputVector,
39 const float lower_bound,
40 const float upper_bound,
41 unsigned int num_points)
43 float* outPtr = outputVector;
/*
 * Width of the range; every correction below is a multiple of it.
 * NOTE(review): nothing visible rejects upper_bound <= lower_bound. With a
 * zero width the divisions below yield inf/NaN -- confirm callers guarantee a
 * positive width.
 */
45 const float distance = upper_bound - lower_bound;
47 for (inPtr = inputVector; inPtr < inputVector + num_points; inPtr++) {
/* val is presumably the sample read through inPtr (its definition on listing
 * line 48 is not visible) -- TODO confirm. */
49 if (
val < lower_bound) {
/* Below the range: excess is how far the sample undershoots the lower bound. */
50 float excess = lower_bound -
val;
/*
 * Whole range widths contained in the excess; the cast truncates toward zero.
 * NOTE(review): converting a float that does not fit in int is undefined
 * behavior in C, so very large excess/distance ratios are not handled safely.
 */
51 signed int count = (int)(excess / distance);
/* Shift up by one width more than the truncated count. */
52 *outPtr =
val + (count + 1) * distance;
53 }
else if (
val > upper_bound) {
/* Above the range: mirror image of the branch above, measured from upper_bound. */
54 float excess =
val - upper_bound;
/* Same truncating conversion (and the same out-of-range caveat) as above. */
55 signed int count = (int)(excess / distance);
/* Shift down by one width more than the truncated count. */
56 *outPtr =
val - (count + 1) * distance;
66 #include <xmmintrin.h>
/*
 * Parameter tail and body of volk_32f_s32f_s32f_mod_range_32f_u_avx (name and
 * full signature taken from the cross-reference entry for listing line 68 at
 * the end of this file): AVX kernel using unaligned loads and stores.
 *
 * Processes eight floats per iteration without branches: lanes below
 * lower_bound are shifted up and lanes above upper_bound are shifted down by
 * (trunc(excess / distance) + 1) * distance; lanes inside the range receive a
 * zero correction and pass through unchanged.
 *
 * inputVector  - samples to read (no alignment requirement: loadu is used).
 * lower_bound  - lower edge of the target range.
 * upper_bound  - upper edge of the target range.
 * num_points   - number of samples to process.
 *
 * NOTE(review): this listing drops source lines (embedded numbers 68, 73, 77,
 * 79-80, 93, 109-113 and the closing lines are absent). The declarations of
 * input, output, adj and excess, the advance of inPtr/outPtr and the name of
 * the tail-call target are not visible here -- confirm against the real header.
 * NOTE(review): the only include visible ahead of this kernel is
 * <xmmintrin.h>; the _mm256_* intrinsics are normally declared through
 * <immintrin.h> -- confirm the AVX header is pulled in elsewhere.
 */
69 const float* inputVector,
70 const float lower_bound,
71 const float upper_bound,
72 unsigned int num_points)
/* Broadcast both bounds and the range width to all eight lanes. */
74 const __m256 lower = _mm256_set1_ps(lower_bound);
75 const __m256 upper = _mm256_set1_ps(upper_bound);
76 const __m256 distance = _mm256_sub_ps(upper, lower);
78 __m256 is_smaller, is_bigger;
81 const float* inPtr = inputVector;
82 float* outPtr = outputVector;
/* Number of full 8-float groups; the remainder is passed to the call at the end. */
83 const size_t eight_points = num_points / 8;
84 for (
size_t counter = 0; counter < eight_points; counter++) {
85 input = _mm256_loadu_ps(inPtr);
/*
 * Per-lane masks: all-ones where the sample is below lower / above upper.
 * The ordered, quiet predicates make NaN lanes compare false in both.
 */
87 is_smaller = _mm256_cmp_ps(
88 input, lower, _CMP_LT_OQ);
89 is_bigger = _mm256_cmp_ps(
90 input, upper, _CMP_GT_OQ);
/* Distance past the violated bound; masked to zero in lanes that are not below. */
92 excess = _mm256_and_ps(_mm256_sub_ps(lower, input), is_smaller);
/*
 * Merge in the overshoot of lanes above the range.
 * NOTE(review): the assignment target of this expression is not visible
 * (listing line 93 is absent); presumably `excess =` -- confirm.
 */
94 _mm256_or_ps(_mm256_and_ps(_mm256_sub_ps(input, upper), is_bigger), excess);
/* Excess expressed in range widths (still fractional). */
96 excess = _mm256_div_ps(excess, distance);
/* Truncate toward zero through an int32 round trip. */
98 excess = _mm256_cvtepi32_ps(_mm256_cvttps_epi32(excess));
/* One extra width on top of the truncated count. */
100 adj = _mm256_set1_ps(1.0f);
101 excess = _mm256_add_ps(excess, adj);
/* Direction per lane: +1.0 where below, -1.0 where above, 0.0 where in range. */
103 adj = _mm256_and_ps(adj, is_smaller);
104 adj = _mm256_or_ps(_mm256_and_ps(_mm256_set1_ps(-1.0f), is_bigger), adj);
/* Signed correction in sample units; zero for in-range lanes. */
106 excess = _mm256_mul_ps(_mm256_mul_ps(excess, adj), distance);
107 output = _mm256_add_ps(input, excess);
108 _mm256_storeu_ps(outPtr, output);
/*
 * Argument list of a call whose callee name is not visible (listing line 113
 * is absent). It hands over the num_points % 8 samples left after the vector
 * loop, presumably to the generic kernel -- TODO confirm. This relies on
 * inPtr/outPtr having been advanced inside the loop, which is also not
 * visible here.
 */
114 outPtr, inPtr, lower_bound, upper_bound, num_points - eight_points * 8);
/*
 * Parameter tail and body of volk_32f_s32f_s32f_mod_range_32f_a_avx (name and
 * full signature taken from the cross-reference entry for listing line 116 at
 * the end of this file): AVX kernel using aligned loads and stores.
 *
 * Processes eight floats per iteration without branches: lanes below
 * lower_bound are shifted up and lanes above upper_bound are shifted down by
 * (trunc(excess / distance) + 1) * distance; lanes inside the range receive a
 * zero correction and pass through unchanged.
 *
 * inputVector  - samples to read; _mm256_load_ps requires the address to be
 *                32-byte aligned.
 * lower_bound  - lower edge of the target range.
 * upper_bound  - upper edge of the target range.
 * num_points   - number of samples to process.
 *
 * NOTE(review): this listing drops source lines (embedded numbers 116, 121,
 * 127-128, 141, 157-161 and the closing lines are absent). The declarations
 * of adj and excess, the advance of inPtr/outPtr and the name of the
 * tail-call target are not visible here -- confirm against the real header.
 */
117 const float* inputVector,
118 const float lower_bound,
119 const float upper_bound,
120 unsigned int num_points)
/* Broadcast both bounds and the range width to all eight lanes. */
122 const __m256 lower = _mm256_set1_ps(lower_bound);
123 const __m256 upper = _mm256_set1_ps(upper_bound);
124 const __m256 distance = _mm256_sub_ps(upper, lower);
125 __m256 input, output;
126 __m256 is_smaller, is_bigger;
129 const float* inPtr = inputVector;
130 float* outPtr = outputVector;
/* Number of full 8-float groups; the remainder is passed to the call at the end. */
131 const size_t eight_points = num_points / 8;
132 for (
size_t counter = 0; counter < eight_points; counter++) {
/* Aligned load: inPtr must be 32-byte aligned here. */
133 input = _mm256_load_ps(inPtr);
/*
 * Per-lane masks: all-ones where the sample is below lower / above upper.
 * The ordered, quiet predicates make NaN lanes compare false in both.
 */
135 is_smaller = _mm256_cmp_ps(
136 input, lower, _CMP_LT_OQ);
137 is_bigger = _mm256_cmp_ps(
138 input, upper, _CMP_GT_OQ);
/* Distance past the violated bound; masked to zero in lanes that are not below. */
140 excess = _mm256_and_ps(_mm256_sub_ps(lower, input), is_smaller);
/*
 * Merge in the overshoot of lanes above the range.
 * NOTE(review): the assignment target of this expression is not visible
 * (listing line 141 is absent); presumably `excess =` -- confirm.
 */
142 _mm256_or_ps(_mm256_and_ps(_mm256_sub_ps(input, upper), is_bigger), excess);
/* Excess expressed in range widths (still fractional). */
144 excess = _mm256_div_ps(excess, distance);
/* Truncate toward zero through an int32 round trip. */
146 excess = _mm256_cvtepi32_ps(_mm256_cvttps_epi32(excess));
/* One extra width on top of the truncated count. */
148 adj = _mm256_set1_ps(1.0f);
149 excess = _mm256_add_ps(excess, adj);
/* Direction per lane: +1.0 where below, -1.0 where above, 0.0 where in range. */
151 adj = _mm256_and_ps(adj, is_smaller);
152 adj = _mm256_or_ps(_mm256_and_ps(_mm256_set1_ps(-1.0f), is_bigger), adj);
/* Signed correction in sample units; zero for in-range lanes. */
154 excess = _mm256_mul_ps(_mm256_mul_ps(excess, adj), distance);
155 output = _mm256_add_ps(input, excess);
/* Aligned store: outPtr must be 32-byte aligned here. */
156 _mm256_store_ps(outPtr, output);
/*
 * Argument list of a call whose callee name is not visible (listing line 161
 * is absent). It hands over the num_points % 8 samples left after the vector
 * loop, presumably to the generic kernel -- TODO confirm. This relies on
 * inPtr/outPtr having been advanced inside the loop, which is also not
 * visible here.
 */
162 outPtr, inPtr, lower_bound, upper_bound, num_points - eight_points * 8);
168 #include <xmmintrin.h>
/*
 * Parameter tail and body of volk_32f_s32f_s32f_mod_range_32f_u_sse2 (name
 * and full signature taken from the cross-reference entry for listing line
 * 170 at the end of this file): SSE2 kernel using unaligned loads and stores.
 *
 * Processes four floats per iteration without branches: lanes below
 * lower_bound are shifted up and lanes above upper_bound are shifted down by
 * (trunc(excess / distance) + 1) * distance; lanes inside the range receive a
 * zero correction and pass through unchanged.
 *
 * inputVector  - samples to read (no alignment requirement: loadu is used).
 * lower_bound  - lower edge of the target range.
 * upper_bound  - upper edge of the target range.
 * num_points   - number of samples to process.
 *
 * NOTE(review): this listing drops source lines (embedded numbers 170, 175,
 * 181-182, 208-212 and the closing lines are absent). The declarations of adj
 * and excess, the advance of inPtr/outPtr and the name of the tail-call
 * target are not visible here -- confirm against the real header.
 * NOTE(review): the only include visible ahead of this kernel is
 * <xmmintrin.h>, while _mm_cvttps_epi32 / _mm_cvtepi32_ps are declared in
 * <emmintrin.h> -- confirm that header is pulled in elsewhere.
 */
171 const float* inputVector,
172 const float lower_bound,
173 const float upper_bound,
174 unsigned int num_points)
/* Broadcast both bounds and the range width to all four lanes. */
176 const __m128 lower = _mm_set_ps1(lower_bound);
177 const __m128 upper = _mm_set_ps1(upper_bound);
178 const __m128 distance = _mm_sub_ps(upper, lower);
179 __m128 input, output;
180 __m128 is_smaller, is_bigger;
183 const float* inPtr = inputVector;
184 float* outPtr = outputVector;
/* Number of full 4-float groups; the remainder is passed to the call at the end. */
185 const size_t quarter_points = num_points / 4;
186 for (
size_t counter = 0; counter < quarter_points; counter++) {
187 input = _mm_loadu_ps(inPtr);
/* Per-lane masks: all-ones where the sample is below lower / above upper. */
189 is_smaller = _mm_cmplt_ps(input, lower);
190 is_bigger = _mm_cmpgt_ps(input, upper);
/* Distance past the violated bound; lanes inside the range stay at zero
 * because both masks are zero there. */
192 excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
193 excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
/* Excess expressed in range widths (still fractional). */
195 excess = _mm_div_ps(excess, distance);
/* Truncate toward zero through an int32 round trip. */
197 excess = _mm_cvtepi32_ps(_mm_cvttps_epi32(excess));
/* One extra width on top of the truncated count. */
199 adj = _mm_set_ps1(1.0f);
200 excess = _mm_add_ps(excess, adj);
/* Direction per lane: +1.0 where below, -1.0 where above, 0.0 where in range. */
202 adj = _mm_and_ps(adj, is_smaller);
203 adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
/* Signed correction in sample units; zero for in-range lanes. */
205 excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
206 output = _mm_add_ps(input, excess);
207 _mm_storeu_ps(outPtr, output);
/*
 * Argument list of a call whose callee name is not visible (listing line 212
 * is absent). It hands over the num_points % 4 samples left after the vector
 * loop, presumably to the generic kernel -- TODO confirm. This relies on
 * inPtr/outPtr having been advanced inside the loop, which is also not
 * visible here.
 */
213 outPtr, inPtr, lower_bound, upper_bound, num_points - quarter_points * 4);
/*
 * Parameter tail and body of volk_32f_s32f_s32f_mod_range_32f_a_sse2 (name
 * and full signature taken from the cross-reference entry for listing line
 * 215 at the end of this file): SSE2 kernel using aligned loads and stores.
 *
 * Processes four floats per iteration without branches: lanes below
 * lower_bound are shifted up and lanes above upper_bound are shifted down by
 * (trunc(excess / distance) + 1) * distance; lanes inside the range receive a
 * zero correction and pass through unchanged.
 *
 * inputVector  - samples to read; _mm_load_ps requires the address to be
 *                16-byte aligned.
 * lower_bound  - lower edge of the target range.
 * upper_bound  - upper edge of the target range.
 * num_points   - number of samples to process.
 *
 * NOTE(review): this listing drops source lines (embedded numbers 215, 220,
 * 226-227, 254-258 and the closing lines are absent). The declarations of adj
 * and excess, the advance of inPtr/outPtr and the name of the tail-call
 * target are not visible here -- confirm against the real header.
 */
216 const float* inputVector,
217 const float lower_bound,
218 const float upper_bound,
219 unsigned int num_points)
/* Broadcast both bounds and the range width to all four lanes. */
221 const __m128 lower = _mm_set_ps1(lower_bound);
222 const __m128 upper = _mm_set_ps1(upper_bound);
223 const __m128 distance = _mm_sub_ps(upper, lower);
224 __m128 input, output;
225 __m128 is_smaller, is_bigger;
228 const float* inPtr = inputVector;
229 float* outPtr = outputVector;
/* Number of full 4-float groups; the remainder is passed to the call at the end. */
230 const size_t quarter_points = num_points / 4;
231 for (
size_t counter = 0; counter < quarter_points; counter++) {
/* Aligned load: inPtr must be 16-byte aligned here. */
232 input = _mm_load_ps(inPtr);
/* Per-lane masks: all-ones where the sample is below lower / above upper. */
234 is_smaller = _mm_cmplt_ps(input, lower);
235 is_bigger = _mm_cmpgt_ps(input, upper);
/* Distance past the violated bound; lanes inside the range stay at zero
 * because both masks are zero there. */
237 excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
238 excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
/* Excess expressed in range widths (still fractional). */
240 excess = _mm_div_ps(excess, distance);
/* Truncate toward zero through an int32 round trip. */
243 excess = _mm_cvtepi32_ps(_mm_cvttps_epi32(excess));
/* One extra width on top of the truncated count. */
245 adj = _mm_set_ps1(1.0f);
246 excess = _mm_add_ps(excess, adj);
/* Direction per lane: +1.0 where below, -1.0 where above, 0.0 where in range. */
248 adj = _mm_and_ps(adj, is_smaller);
249 adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
/* Signed correction in sample units; zero for in-range lanes. */
251 excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
252 output = _mm_add_ps(input, excess);
/* Aligned store: outPtr must be 16-byte aligned here. */
253 _mm_store_ps(outPtr, output);
/*
 * Argument list of a call whose callee name is not visible (listing line 258
 * is absent). It hands over the num_points % 4 samples left after the vector
 * loop, presumably to the generic kernel -- TODO confirm. This relies on
 * inPtr/outPtr having been advanced inside the loop, which is also not
 * visible here.
 */
259 outPtr, inPtr, lower_bound, upper_bound, num_points - quarter_points * 4);
264 #include <xmmintrin.h>
/*
 * Parameter tail and body of volk_32f_s32f_s32f_mod_range_32f_u_sse (name and
 * full signature taken from the cross-reference entry for listing line 266 at
 * the end of this file): kernel labelled SSE, using unaligned loads and
 * stores.
 *
 * Processes four floats per iteration without branches: lanes below
 * lower_bound are shifted up and lanes above upper_bound are shifted down by
 * (trunc(excess / distance) + 1) * distance; lanes inside the range receive a
 * zero correction and pass through unchanged.
 *
 * inputVector  - samples to read (no alignment requirement: loadu is used).
 * lower_bound  - lower edge of the target range.
 * upper_bound  - upper edge of the target range.
 * num_points   - number of samples to process.
 *
 * NOTE(review): this listing drops source lines (embedded numbers 266, 271,
 * 277-279, 306-310 and the closing lines are absent). The declarations of
 * adj, excess and rounddown, the advance of inPtr/outPtr and the name of the
 * tail-call target are not visible here -- confirm against the real header.
 * NOTE(review): _mm_cvttps_epi32 and _mm_cvtepi32_ps are SSE2 intrinsics
 * (declared in <emmintrin.h>), yet this kernel is named for plain SSE and the
 * only include visible ahead of it is <xmmintrin.h> -- confirm this builds
 * and runs on an SSE-only target.
 */
267 const float* inputVector,
268 const float lower_bound,
269 const float upper_bound,
270 unsigned int num_points)
/* Broadcast both bounds and the range width to all four lanes. */
272 const __m128 lower = _mm_set_ps1(lower_bound);
273 const __m128 upper = _mm_set_ps1(upper_bound);
274 const __m128 distance = _mm_sub_ps(upper, lower);
275 __m128 input, output;
276 __m128 is_smaller, is_bigger;
280 const float* inPtr = inputVector;
281 float* outPtr = outputVector;
/* Number of full 4-float groups; the remainder is passed to the call at the end. */
282 const size_t quarter_points = num_points / 4;
283 for (
size_t counter = 0; counter < quarter_points; counter++) {
284 input = _mm_loadu_ps(inPtr);
/* Per-lane masks: all-ones where the sample is below lower / above upper. */
286 is_smaller = _mm_cmplt_ps(input, lower);
287 is_bigger = _mm_cmpgt_ps(input, upper);
/* Distance past the violated bound; lanes inside the range stay at zero
 * because both masks are zero there. */
289 excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
290 excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
/* Excess expressed in range widths (still fractional). */
292 excess = _mm_div_ps(excess, distance);
/* Truncate toward zero: float -> int32 (rounddown holds the integer lanes),
 * then back to float. */
294 rounddown = _mm_cvttps_epi32(excess);
295 excess = _mm_cvtepi32_ps(rounddown);
/* One extra width on top of the truncated count. */
297 adj = _mm_set_ps1(1.0f);
298 excess = _mm_add_ps(excess, adj);
/* Direction per lane: +1.0 where below, -1.0 where above, 0.0 where in range. */
300 adj = _mm_and_ps(adj, is_smaller);
301 adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
/* Signed correction in sample units; zero for in-range lanes. */
303 excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
304 output = _mm_add_ps(input, excess);
305 _mm_storeu_ps(outPtr, output);
/*
 * Argument list of a call whose callee name is not visible (listing line 310
 * is absent). It hands over the num_points % 4 samples left after the vector
 * loop, presumably to the generic kernel -- TODO confirm. This relies on
 * inPtr/outPtr having been advanced inside the loop, which is also not
 * visible here.
 */
311 outPtr, inPtr, lower_bound, upper_bound, num_points - quarter_points * 4);
/*
 * Parameter tail and body of volk_32f_s32f_s32f_mod_range_32f_a_sse (name and
 * full signature taken from the cross-reference entry for listing line 313 at
 * the end of this file): kernel labelled SSE, using aligned loads and stores.
 *
 * Processes four floats per iteration without branches: lanes below
 * lower_bound are shifted up and lanes above upper_bound are shifted down by
 * (trunc(excess / distance) + 1) * distance; lanes inside the range receive a
 * zero correction and pass through unchanged.
 *
 * inputVector  - samples to read; _mm_load_ps requires the address to be
 *                16-byte aligned.
 * lower_bound  - lower edge of the target range.
 * upper_bound  - upper edge of the target range.
 * num_points   - number of samples to process.
 *
 * NOTE(review): this listing drops source lines (embedded numbers 313, 318,
 * 324-326, 353-357 and the closing lines are absent). The declarations of
 * adj, excess and rounddown, the advance of inPtr/outPtr and the name of the
 * tail-call target are not visible here -- confirm against the real header.
 * NOTE(review): _mm_cvttps_epi32 and _mm_cvtepi32_ps are SSE2 intrinsics
 * (declared in <emmintrin.h>), yet this kernel is named for plain SSE --
 * confirm this builds and runs on an SSE-only target.
 */
314 const float* inputVector,
315 const float lower_bound,
316 const float upper_bound,
317 unsigned int num_points)
/* Broadcast both bounds and the range width to all four lanes. */
319 const __m128 lower = _mm_set_ps1(lower_bound);
320 const __m128 upper = _mm_set_ps1(upper_bound);
321 const __m128 distance = _mm_sub_ps(upper, lower);
322 __m128 input, output;
323 __m128 is_smaller, is_bigger;
327 const float* inPtr = inputVector;
328 float* outPtr = outputVector;
/* Number of full 4-float groups; the remainder is passed to the call at the end. */
329 const size_t quarter_points = num_points / 4;
330 for (
size_t counter = 0; counter < quarter_points; counter++) {
/* Aligned load: inPtr must be 16-byte aligned here. */
331 input = _mm_load_ps(inPtr);
/* Per-lane masks: all-ones where the sample is below lower / above upper. */
333 is_smaller = _mm_cmplt_ps(input, lower);
334 is_bigger = _mm_cmpgt_ps(input, upper);
/* Distance past the violated bound; lanes inside the range stay at zero
 * because both masks are zero there. */
336 excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
337 excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
/* Excess expressed in range widths (still fractional). */
339 excess = _mm_div_ps(excess, distance);
/* Truncate toward zero: float -> int32 (rounddown holds the integer lanes),
 * then back to float. */
341 rounddown = _mm_cvttps_epi32(excess);
342 excess = _mm_cvtepi32_ps(rounddown);
/* One extra width on top of the truncated count. */
344 adj = _mm_set_ps1(1.0f);
345 excess = _mm_add_ps(excess, adj);
/* Direction per lane: +1.0 where below, -1.0 where above, 0.0 where in range. */
347 adj = _mm_and_ps(adj, is_smaller);
348 adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
/* Signed correction in sample units; zero for in-range lanes. */
350 excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
351 output = _mm_add_ps(input, excess);
/* Aligned store: outPtr must be 16-byte aligned here. */
352 _mm_store_ps(outPtr, output);
/*
 * Argument list of a call whose callee name is not visible (listing line 357
 * is absent). It hands over the num_points % 4 samples left after the vector
 * loop, presumably to the generic kernel -- TODO confirm. This relies on
 * inPtr/outPtr having been advanced inside the loop, which is also not
 * visible here.
 */
358 outPtr, inPtr, lower_bound, upper_bound, num_points - quarter_points * 4);
val
Definition: volk_arch_defs.py:57
static void volk_32f_s32f_s32f_mod_range_32f_u_sse(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:266
static void volk_32f_s32f_s32f_mod_range_32f_a_avx(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:116
static void volk_32f_s32f_s32f_mod_range_32f_a_sse(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:313
static void volk_32f_s32f_s32f_mod_range_32f_u_sse2(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:170
static void volk_32f_s32f_s32f_mod_range_32f_generic(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:37
static void volk_32f_s32f_s32f_mod_range_32f_u_avx(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:68
static void volk_32f_s32f_s32f_mod_range_32f_a_sse2(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:215